[llvm] c931f2e - [AMDGPU] Autogenerate & pre-commit tests for D156301 and D157388

Pravin Jagtap via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 18 06:51:18 PDT 2023


Author: Pravin Jagtap
Date: 2023-08-18T09:50:44-04:00
New Revision: c931f2e6fd0c5a84707291f731e3c4b68d88de30

URL: https://github.com/llvm/llvm-project/commit/c931f2e6fd0c5a84707291f731e3c4b68d88de30
DIFF: https://github.com/llvm/llvm-project/commit/c931f2e6fd0c5a84707291f731e3c4b68d88de30.diff

LOG: [AMDGPU] Autogenerate & pre-commit tests for D156301 and D157388

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D157712

Added: 
    llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
    llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll

Modified: 
    llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
    llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
new file mode 100644
index 00000000000000..79c3d449b53ed6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
@@ -0,0 +1,322 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s
+
+define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret float %result
+}
+
+
+define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
+  ret float %result
+}
+
+
+define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret float %result
+}
+
+
+define amdgpu_ps float @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
+  ret float %result
+}
+
+define amdgpu_ps float @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-DPP-NEXT:    ret float [[RESULT]]
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
+  ret float %result
+}
+
+attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #2 = { strictfp }

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
new file mode 100644
index 00000000000000..c087f2373d57aa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
@@ -0,0 +1,322 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s
+
+define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret void
+}
+
+
+define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float inreg %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, float %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float inreg %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_system_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fadd_uni_address_div_value_system_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
+  ret void
+}
+
+
+define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret void
+}
+
+
+define amdgpu_ps void @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 {
+; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
+  ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-ITERATIVE-NEXT:    ret void
+;
+; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
+; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-DPP-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
+  ret void
+}
+
+attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #2 = { strictfp }

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
new file mode 100644
index 00000000000000..c7262c37626364
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -0,0 +1,3246 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+
+declare float @div.float.value()
+
+define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_atomic_add_f32 v40, v0, s[34:35]
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_atomic_add_f32 v40, v0, s[34:35]
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_atomic_add_f32 v40, v0, s[34:35]
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_atomic_add_f32 v40, v0, s[34:35]
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
+  ret void
+}
+
+
+define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic
+  ret void
+}
+
+
+define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_atomic_add_f32 v40, v0, s[34:35]
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_atomic_add_f32 v40, v0, s[34:35]
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_atomic_add_f32 v40, v0, s[34:35]
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_atomic_add_f32 v40, v0, s[34:35]
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
+  ret void
+}
+
+
+define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_atomic_add_f32 v40, v0, s[34:35]
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_atomic_add_f32 v40, v0, s[34:35]
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_atomic_add_f32 v40, v0, s[34:35]
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_atomic_add_f32 v40, v0, s[34:35]
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 {
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, 4.0, v1
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 {
+; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue monotonic, align 4
+  ret void
+}
+
+attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #2 = { strictfp}

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
new file mode 100644
index 00000000000000..2cc59a16e4ecd5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -0,0 +1,2338 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+
+declare float @div.float.value()
+
+define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX9-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1064-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1032-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1164-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1132-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX9-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
+  ret void
+}
+
+
+define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX9-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1064-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1032-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1164-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1132-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX9-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
+  ret void
+}
+
+
+define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX9-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1064-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1032-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1164-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1132-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX9-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue monotonic, align 4
+  ret void
+}
+
+attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
new file mode 100644
index 00000000000000..455a8dca101685
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -0,0 +1,2338 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+
+declare float @div.float.value()
+
+define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX9-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1064-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1032-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1164-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1132-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX9-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
+  ret void
+}
+
+
+define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX9-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1064-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1032-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1164-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1132-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX9-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
+  ret void
+}
+
+
+define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v0
+; GFX7LESS-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX9-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX9-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1064-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1032-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1164-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1132-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX9-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX9-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v1, v40, s[34:35]
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v40, v[0:1], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_load_b32 v1, v40, s[34:35]
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v0, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v40, v[0:1], s[34:35] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue monotonic, align 4
+  ret void
+}
+
+attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }

diff  --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
new file mode 100644
index 00000000000000..ffae7980a9d498
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -0,0 +1,3445 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+
+declare float @div.float.value()
+
+define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_add_f32_e32 v0, -4.0, v1
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_add_f32_e32 v0, -4.0, v1
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_add_f32_e32 v0, -4.0, v1
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_add_f32_e32 v0, -4.0, v1
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_add_f32_e32 v0, -4.0, v1
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_add_f32_e32 v0, -4.0, v1
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_add_f32_e32 v0, -4.0, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_add_f32_e32 v0, -4.0, v1
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_add_f32_e32 v0, -4.0, v1
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_add_f32_e32 v0, -4.0, v1
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_add_f32_e32 v0, -4.0, v1
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
+  ret void
+}
+
+
+define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{
+; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic
+  ret void
+}
+
+
+define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
+  ret void
+}
+
+
+define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 {
+; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX9-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_subrev_f32_e32 v0, 4.0, v1
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 {
+; GFX7LESS-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
+; GFX7LESS:       ; %bb.0:
+; GFX7LESS-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT:    s_mov_b32 s14, s10
+; GFX7LESS-NEXT:    s_mov_b32 s13, s9
+; GFX7LESS-NEXT:    s_mov_b32 s12, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
+; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT:    buffer_load_dword v2, off, s[36:39], 0
+; GFX7LESS-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7LESS-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7LESS-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT:    s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s10
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX9-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
+; GFX1064:       ; %bb.0:
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-NEXT:    s_mov_b32 s13, s9
+; GFX1064-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s10
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s32, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1064-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1064-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT:    s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
+; GFX1032:       ; %bb.0:
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-NEXT:    s_mov_b32 s13, s9
+; GFX1032-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s10
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s32, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1032-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1032-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT:    s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
+; GFX1164:       ; %bb.0:
+; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-NEXT:    s_mov_b32 s13, s9
+; GFX1164-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-NEXT:    s_mov_b32 s14, s10
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s32, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1164-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT:    s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
+; GFX1132:       ; %bb.0:
+; GFX1132-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-NEXT:    s_mov_b32 s13, s14
+; GFX1132-NEXT:    s_mov_b32 s14, s15
+; GFX1132-NEXT:    s_mov_b32 s32, 0
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1132-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT:    s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
+; GFX9-DPP:       ; %bb.0:
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX9-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT:    s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
+; GFX1064-DPP:       ; %bb.0:
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1064-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1064-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1064-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1064-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1064-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT:    s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
+; GFX1032-DPP:       ; %bb.0:
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1032-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1032-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT:    global_load_dword v2, v40, s[34:35]
+; GFX1032-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1032-DPP-NEXT:    global_atomic_cmpswap v1, v40, v[1:2], s[34:35] glc
+; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1032-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1032-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT:    s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
+; GFX1164-DPP:       ; %bb.0:
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX1164-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1164-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1164-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1164-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT:    s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
+; GFX1132-DPP:       ; %bb.0:
+; GFX1132-DPP-NEXT:    s_add_u32 s8, s4, 44
+; GFX1132-DPP-NEXT:    s_addc_u32 s9, s5, 0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[6:7]
+; GFX1132-DPP-NEXT:    s_add_u32 s6, s6, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s7, s7, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[6:7], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    global_load_b32 v2, v40, s[34:35]
+; GFX1132-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v1, v40, v[1:2], s[34:35] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
+; GFX1132-DPP-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX1132-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT:    s_endpgm
+  %divValue = call float @div.float.value()
+  %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue monotonic, align 4
+  ret void
+}
+
+attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #2 = { strictfp}
+

diff  --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 1739612176fcf2..35860bcc23e937 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -1,42 +1,334 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX678,HAS-ATOMICS %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,HAS-ATOMICS %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX678,NO-ATOMICS %s
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX678,NO-ATOMICS %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
 
-; GCN-LABEL: {{^}}lds_atomic_fadd_ret_f32:
-; GFX678-DAG: s_mov_b32 m0
-; GFX9-NOT: m0
-; HAS-ATOMICS-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 4.0
-; HAS-ATOMICS: ds_add_rtn_f32 v0, v0, [[K]]
-
-; NO-ATOMICS: ds_read_b32
-; NO-ATOMICS: v_add_f32
-; NO-ATOMICS: ds_cmpst_rtn_b32
-; NO-ATOMICS: s_cbranch_execnz
 define float @lds_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind {
+; VI-LABEL: lds_atomic_fadd_ret_f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, 4.0
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_add_rtn_f32 v0, v0, v1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fadd_ret_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_add_rtn_f32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: lds_atomic_fadd_ret_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v2, v1
+; GFX7-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: lds_atomic_fadd_ret_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b32 v1, v0
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, v1
+; GFX8-NEXT:    v_add_f32_e32 v1, 4.0, v2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst
   ret float %result
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fadd_noret_f32:
-; GFX678-DAG: s_mov_b32 m0
-; GFX9-NOT: m0
-; HAS-ATOMICS-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 4.0
-; HAS-ATOMICS: ds_add_f32 v0, [[K]]
 define void @lds_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind {
+; VI-LABEL: lds_atomic_fadd_noret_f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, 4.0
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_add_f32 v0, v1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fadd_noret_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_add_f32 v0, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: lds_atomic_fadd_noret_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_b32 v1, v0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v2, 4.0, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: lds_atomic_fadd_noret_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b32 v1, v0
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_add_f32_e32 v2, 4.0, v1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v2, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_ds_fadd:
-; VI-DAG: s_mov_b32 m0
-; GFX9-NOT: m0
-; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
-; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
-; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
-; HAS-ATOMICS: s_waitcnt lgkmcnt(0)
-; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
 define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) {
+; VI-LABEL: lds_ds_fadd:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v0, 0x42280000
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshl_b32 s4, s3, 3
+; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_add_rtn_f32 v1, v1, v0 offset:32
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshl_b32 s3, s3, 4
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_add_f32 v2, v0 offset:64
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_add_rtn_f32 v2, v0, v1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: lds_ds_fadd:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x42280000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshl_b32 s4, s3, 3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_add_rtn_f32 v1, v1, v0 offset:32
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_add_f32 v2, v0 offset:64
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_add_rtn_f32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX7-LABEL: lds_ds_fadd:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshl_b32 s4, s3, 3
+; GFX7-NEXT:    s_add_i32 s4, s4, 32
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_read_b32 v0, v0
+; GFX7-NEXT:    s_add_i32 s3, s3, 4
+; GFX7-NEXT:    s_lshl_b32 s6, s3, 3
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v0, v0, v1, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    ds_read_b32 v1, v1
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB2_3: ; %atomicrmw.start2
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v2, v3, v1, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB2_3
+; GFX7-NEXT:  ; %bb.4: ; %atomicrmw.end1
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    ds_read_b32 v1, v1
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB2_5: ; %atomicrmw.start8
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v2, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    v_add_f32_e32 v3, v2, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v1, v2, v3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB2_5
+; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end7
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: lds_ds_fadd:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshl_b32 s4, s3, 3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    ds_read_b32 v0, v0 offset:32
+; GFX8-NEXT:    s_add_i32 s3, s3, 4
+; GFX8-NEXT:    s_lshl_b32 s6, s3, 3
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v0, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    ds_read_b32 v1, v1
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB2_3: ; %atomicrmw.start2
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v2, v3, v1, v2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB2_3
+; GFX8-NEXT:  ; %bb.4: ; %atomicrmw.end1
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    ds_read_b32 v1, v1
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB2_5: ; %atomicrmw.start8
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_add_f32_e32 v3, v2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v1, v1, v2, v3
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB2_5
+; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end7
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
   %idx.add = add nuw i32 %idx, 4
   %shl0 = shl i32 %idx.add, 3
   %shl1 = shl i32 %idx.add, 4
@@ -49,15 +341,181 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_ds_fadd_one_as:
-; VI-DAG: s_mov_b32 m0
-; GFX9-NOT: m0
-; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
-; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
-; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
-; HAS-ATOMICS: s_waitcnt lgkmcnt(1)
-; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
 define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspace(3) %ptrf, i32 %idx) {
+; VI-LABEL: lds_ds_fadd_one_as:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v0, 0x42280000
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshl_b32 s4, s3, 3
+; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    ds_add_rtn_f32 v1, v1, v0 offset:32
+; VI-NEXT:    s_lshl_b32 s3, s3, 4
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    ds_add_f32 v2, v0 offset:64
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NEXT:    ds_add_rtn_f32 v2, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: lds_ds_fadd_one_as:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x42280000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshl_b32 s4, s3, 3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    ds_add_rtn_f32 v1, v1, v0 offset:32
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    ds_add_f32 v2, v0 offset:64
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NEXT:    ds_add_rtn_f32 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX7-LABEL: lds_ds_fadd_one_as:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_lshl_b32 s4, s3, 3
+; GFX7-NEXT:    s_add_i32 s4, s4, 32
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    ds_read_b32 v0, v0
+; GFX7-NEXT:    s_add_i32 s3, s3, 4
+; GFX7-NEXT:    s_lshl_b32 s6, s3, 3
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v0, v0, v1, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    ds_read_b32 v1, v1
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB3_3: ; %atomicrmw.start2
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v2, v3, v1, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB3_3
+; GFX7-NEXT:  ; %bb.4: ; %atomicrmw.end1
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    ds_read_b32 v1, v1
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB3_5: ; %atomicrmw.start8
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v2, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    v_add_f32_e32 v3, v2, v0
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v1, v1, v2, v3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB3_5
+; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end7
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: lds_ds_fadd_one_as:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshl_b32 s4, s3, 3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    ds_read_b32 v0, v0 offset:32
+; GFX8-NEXT:    s_add_i32 s3, s3, 4
+; GFX8-NEXT:    s_lshl_b32 s6, s3, 3
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v0, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    ds_read_b32 v1, v1
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB3_3: ; %atomicrmw.start2
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_add_f32_e32 v2, 0x42280000, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v2, v3, v1, v2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v2
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB3_3
+; GFX8-NEXT:  ; %bb.4: ; %atomicrmw.end1
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    ds_read_b32 v1, v1
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB3_5: ; %atomicrmw.start8
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_add_f32_e32 v3, v2, v0
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v1, v1, v2, v3
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB3_5
+; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end7
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
   %idx.add = add nuw i32 %idx, 4
   %shl0 = shl i32 %idx.add, 3
   %shl1 = shl i32 %idx.add, 4
@@ -70,61 +528,580 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fadd_ret_f64:
-; GCN: ds_read_b64
-; GCN: v_add_f64
-; GCN: ds_cmpst_rtn_b64
-; GCN: s_cbranch_execnz
 define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind {
+; VI-LABEL: lds_atomic_fadd_ret_f64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    ds_read_b64 v[0:1], v0
+; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:  .LBB4_1: ; %atomicrmw.start
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v4, v1
+; VI-NEXT:    v_mov_b32_e32 v3, v0
+; VI-NEXT:    v_add_f64 v[0:1], v[3:4], 4.0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
+; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_cbranch_execnz .LBB4_1
+; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fadd_ret_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    ds_read_b64 v[0:1], v0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_add_f64 v[0:1], v[3:4], 4.0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: lds_atomic_fadd_ret_f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    ds_read_b64 v[0:1], v0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7-NEXT:    v_add_f64 v[0:1], v[3:4], 4.0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: lds_atomic_fadd_ret_f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    ds_read_b64 v[0:1], v0
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v4, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, v0
+; GFX8-NEXT:    v_add_f64 v[0:1], v[3:4], 4.0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
   ret double %result
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fadd_noret_f64:
-; GCN: ds_read_b64
-; GCN: v_add_f64
-; GCN: ds_cmpst_rtn_b64
-; GCN: s_cbranch_execnz
 define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
+; VI-LABEL: lds_atomic_fadd_noret_f64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    ds_read_b64 v[1:2], v0
+; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:  .LBB5_1: ; %atomicrmw.start
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_add_f64 v[3:4], v[1:2], 4.0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; VI-NEXT:    v_mov_b32_e32 v1, v3
+; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v2, v4
+; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_cbranch_execnz .LBB5_1
+; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fadd_noret_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b64 v[1:2], v0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_add_f64 v[3:4], v[1:2], 4.0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: lds_atomic_fadd_noret_f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_b64 v[1:2], v0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_add_f64 v[3:4], v[1:2], 4.0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v2, v4
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: lds_atomic_fadd_noret_f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b64 v[1:2], v0
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_add_f64 v[3:4], v[1:2], 4.0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX8-NEXT:    v_mov_b32_e32 v1, v3
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v4
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fsub_ret_f32:
-; GCN: ds_read_b32
-; GCN: v_sub_f32
-; GCN: ds_cmpst_rtn_b32
-; GCN: s_cbranch_execnz
 define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwind {
+; VI-LABEL: lds_atomic_fsub_ret_f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    ds_read_b32 v2, v0
+; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:  .LBB6_1: ; %atomicrmw.start
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v3, v2
+; VI-NEXT:    v_sub_f32_e32 v2, v3, v1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_cbranch_execnz .LBB6_1
+; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v0, v2
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fsub_ret_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b32 v2, v0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-NEXT:    v_sub_f32_e32 v2, v3, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: lds_atomic_fsub_ret_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_b32 v2, v0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v3, v2
+; GFX7-NEXT:    v_sub_f32_e32 v2, v3, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: lds_atomic_fsub_ret_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b32 v2, v0
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8-NEXT:    v_sub_f32_e32 v2, v3, v1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fsub ptr addrspace(3) %ptr, float %val seq_cst
   ret float %result
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fsub_noret_f32:
-; GCN: ds_read_b32
-; GCN: v_sub_f32
-; GCN: ds_cmpst_rtn_b32
 define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwind {
+; VI-LABEL: lds_atomic_fsub_noret_f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    ds_read_b32 v2, v0
+; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:  .LBB7_1: ; %atomicrmw.start
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_sub_f32_e32 v3, v2, v1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v2, v3
+; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_cbranch_execnz .LBB7_1
+; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fsub_noret_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b32 v2, v0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_sub_f32_e32 v3, v2, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v3
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: lds_atomic_fsub_noret_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_b32 v2, v0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_sub_f32_e32 v3, v2, v1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: lds_atomic_fsub_noret_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b32 v2, v0
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_sub_f32_e32 v3, v2, v1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v3
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fsub ptr addrspace(3) %ptr, float %val seq_cst
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fsub_ret_f64:
-; GCN: ds_read_b64
-; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
-; GCN: ds_cmpst_rtn_b64
-
 define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounwind {
+; VI-LABEL: lds_atomic_fsub_ret_f64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    ds_read_b64 v[3:4], v0
+; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:  .LBB8_1: ; %atomicrmw.start
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    v_mov_b32_e32 v5, v3
+; VI-NEXT:    v_add_f64 v[3:4], v[5:6], -v[1:2]
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4]
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_cbranch_execnz .LBB8_1
+; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v0, v3
+; VI-NEXT:    v_mov_b32_e32 v1, v4
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fsub_ret_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b64 v[3:4], v0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-NEXT:    v_add_f64 v[3:4], v[5:6], -v[1:2]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: lds_atomic_fsub_ret_f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_b64 v[3:4], v0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_add_f64 v[3:4], v[5:6], -v[1:2]
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4]
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v1, v4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: lds_atomic_fsub_ret_f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b64 v[3:4], v0
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8-NEXT:    v_add_f64 v[3:4], v[5:6], -v[1:2]
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4]
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    v_mov_b32_e32 v1, v4
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fsub ptr addrspace(3) %ptr, double %val seq_cst
   ret double %result
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fsub_noret_f64:
-; GCN: ds_read_b64
-; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
-; GCN: ds_cmpst_rtn_b64
-; GCN: s_cbranch_execnz
 define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounwind {
+; VI-LABEL: lds_atomic_fsub_noret_f64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    ds_read_b64 v[3:4], v0
+; VI-NEXT:    s_mov_b64 s[4:5], 0
+; VI-NEXT:  .LBB9_1: ; %atomicrmw.start
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_add_f64 v[5:6], v[3:4], -v[1:2]
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6]
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
+; VI-NEXT:    v_mov_b32_e32 v3, v5
+; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v4, v6
+; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_cbranch_execnz .LBB9_1
+; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
+; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fsub_noret_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b64 v[3:4], v0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_add_f64 v[5:6], v[3:4], -v[1:2]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
+; GFX9-NEXT:    v_mov_b32_e32 v3, v5
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v4, v6
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: lds_atomic_fsub_noret_f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_b64 v[3:4], v0
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_add_f64 v[5:6], v[3:4], -v[1:2]
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6]
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
+; GFX7-NEXT:    v_mov_b32_e32 v3, v5
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v4, v6
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: lds_atomic_fsub_noret_f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b64 v[3:4], v0
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_add_f64 v[5:6], v[3:4], -v[1:2]
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6]
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v5
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v6
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fsub ptr addrspace(3) %ptr, double %val seq_cst
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
index 46ea7266f28254..ec73b48c02ef4f 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
@@ -1,13 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}shl_base_atomicrmw_global_ptr:
-; GCN-DAG: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4
-; GCN-DAG: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc
-; GCN-DAG: v_lshlrev_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], 2, v[4:5]
-; GCN-DAG: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
-; GCN-DAG: global_atomic_and v[[[LO]]:[[HI]]], [[THREE]], off offset:512
-; GCN-DAG: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[EXTRA_LO]]:[[EXTRA_HI]]]
 define void @shl_base_atomicrmw_global_ptr(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) #0 {
+; GCN-LABEL: shl_base_atomicrmw_global_ptr:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b64 v[0:1], 2, v[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v6, 3
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    global_atomic_and v[0:1], v6, off offset:512
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_wbinvl1_vol
+; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, 0x80, v4
+; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
+; GCN-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 32
   %cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
   %shl = shl i64 %cast, 2
@@ -17,14 +25,18 @@ define void @shl_base_atomicrmw_global_ptr(ptr addrspace(1) %out, ptr addrspace(
   ret void
 }
 
-; GCN-LABEL: {{^}}shl_base_global_ptr_global_atomic_fadd:
-; GCN-DAG: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4
-; GCN-DAG: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc
-; GCN-DAG: v_lshlrev_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], 2, v[4:5]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
-; GCN-DAG: global_atomic_add_f32 v[[[LO]]:[[HI]]], [[K]], off offset:512
-; GCN-DAG: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[EXTRA_LO]]:[[EXTRA_HI]]]
 define void @shl_base_global_ptr_global_atomic_fadd(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) #0 {
+; GCN-LABEL: shl_base_global_ptr_global_atomic_fadd:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_lshlrev_b64 v[0:1], 2, v[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v6, 0x42c80000
+; GCN-NEXT:    global_atomic_add_f32 v[0:1], v6, off offset:512
+; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, 0x80, v4
+; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
+; GCN-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 32
   %cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
   %shl = shl i64 %cast, 2


        


More information about the llvm-commits mailing list