[clang] [llvm] AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering (PR #96759)

Matt Arsenault via cfe-commits cfe-commits at lists.llvm.org
Tue Jul 16 12:14:09 PDT 2024


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/96759

>From 3e0884a1f2f8854ab5f45a7b2ebae3812f5b536e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sun, 23 Jun 2024 11:16:22 +0200
Subject: [PATCH] AMDGPU: Handle remote/fine-grained memory in atomicrmw
 fmin/fmax lowering

Consider the new atomic metadata when choosing to expand as cmpxchg
instead.
---
 clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu   |    8 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   83 +-
 .../AMDGPU/GlobalISel/atomicrmw_fmax.ll       | 1003 +--
 .../AMDGPU/GlobalISel/atomicrmw_fmin.ll       | 1003 +--
 .../buffer-fat-pointer-atomicrmw-fmax.ll      | 3472 +++++---
 .../buffer-fat-pointer-atomicrmw-fmin.ll      | 3472 +++++---
 .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll     |  446 +-
 .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll     |  446 +-
 .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll   | 4793 ++++++-----
 .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll   | 4793 ++++++-----
 .../AMDGPU/global_atomics_scan_fmax.ll        | 7412 +++++------------
 .../AMDGPU/global_atomics_scan_fmin.ll        | 7412 +++++------------
 .../AMDGPU/expand-atomic-f32-agent.ll         | 2052 ++++-
 .../AMDGPU/expand-atomic-f32-system.ll        | 1496 +++-
 .../AMDGPU/expand-atomic-f64-agent.ll         | 1016 ++-
 .../AMDGPU/expand-atomic-f64-system.ll        |  738 +-
 16 files changed, 19189 insertions(+), 20456 deletions(-)

diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
index eeb23bc7e1c01..ea2827f0c9950 100644
--- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu
@@ -57,8 +57,8 @@ __global__ void ffp2(double *p) {
   // SAFE: global_atomic_cmpswap_b64
   // UNSAFE-LABEL: @_Z4ffp2Pd
   // UNSAFE: global_atomic_cmpswap_x2
-  // UNSAFE: global_atomic_cmpswap_x2
-  // UNSAFE: global_atomic_cmpswap_x2
+  // UNSAFE: global_atomic_max_f64
+  // UNSAFE: global_atomic_min_f64
   // UNSAFE: global_atomic_max_f64
   // UNSAFE: global_atomic_min_f64
   __atomic_fetch_sub(p, 1.0, memory_order_relaxed);
@@ -84,8 +84,8 @@ __global__ void ffp3(long double *p) {
   // SAFE: global_atomic_cmpswap_b64
   // UNSAFE-LABEL: @_Z4ffp3Pe
   // UNSAFE: global_atomic_cmpswap_x2
-  // UNSAFE: global_atomic_cmpswap_x2
-  // UNSAFE: global_atomic_cmpswap_x2
+  // UNSAFE: global_atomic_max_f64
+  // UNSAFE: global_atomic_min_f64
   // UNSAFE: global_atomic_max_f64
   // UNSAFE: global_atomic_min_f64
   __atomic_fetch_sub(p, 1.0L, memory_order_relaxed);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bb8e21772e566..55dc987630dc7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16110,6 +16110,34 @@ static bool isBFloat2(Type *Ty) {
   return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
 }
 
+/// \returns true if it's valid to emit a native instruction for \p RMW, based
+/// on the properties of the target memory.
+static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
+                                        const AtomicRMWInst *RMW,
+                                        bool HasSystemScope) {
+  // The remote/fine-grained access logic is different from the integer
+  // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
+  // fine-grained access does not work, even for a device local allocation.
+  //
+  // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
+  // allocations work.
+  if (HasSystemScope) {
+    if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
+        RMW->hasMetadata("amdgpu.no.remote.memory"))
+      return true;
+  } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
+    return true;
+
+  if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
+    return true;
+
+  // TODO: Auto-upgrade this attribute to the metadata in function body and stop
+  // checking it.
+  return RMW->getFunction()
+      ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+      .getValueAsBool();
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
@@ -16260,37 +16288,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     Type *Ty = RMW->getType();
 
     // LDS float and double fmin/fmax were always supported.
-    if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
-      return AtomicExpansionKind::None;
-
-    if (unsafeFPAtomicsDisabled(RMW->getFunction()))
-      return AtomicExpansionKind::CmpXChg;
-
-    // Always expand system scope fp atomics.
-    if (HasSystemScope)
-      return AtomicExpansionKind::CmpXChg;
+    if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+      return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
+                                                 : AtomicExpansionKind::CmpXChg;
+    }
 
-    // For flat and global cases:
-    // float, double in gfx7. Manual claims denormal support.
-    // Removed in gfx8.
-    // float, double restored in gfx10.
-    // double removed again in gfx11, so only f32 for gfx11/gfx12.
-    //
-    // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
-    // f32.
-    //
-    // FIXME: Check scope and fine grained memory
-    if (AS == AMDGPUAS::FLAT_ADDRESS) {
-      if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
-      if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
-    } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
-               AS == AMDGPUAS::BUFFER_FAT_POINTER) {
-      if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
-      if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
-        return ReportUnsafeHWInst(AtomicExpansionKind::None);
+    if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
+      // For flat and global cases:
+      // float, double in gfx7. Manual claims denormal support.
+      // Removed in gfx8.
+      // float, double restored in gfx10.
+      // double removed again in gfx11, so only f32 for gfx11/gfx12.
+      //
+      // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
+      // no f32.
+      if (AS == AMDGPUAS::FLAT_ADDRESS) {
+        if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+        if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+      } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
+                 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+        if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+        if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
+          return ReportUnsafeHWInst(AtomicExpansionKind::None);
+      }
     }
 
     return AtomicExpansionKind::CmpXChg;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index c701e873fdd2c..61d2c854dffa5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -318,29 +318,11 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -371,55 +353,21 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
 ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -500,27 +448,10 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
@@ -534,28 +465,11 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -585,53 +499,21 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -709,26 +591,9 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_mov_b32_e32 v6, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -770,27 +635,10 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v4
-; GFX940-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -824,54 +672,19 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v7, v5
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
-; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -929,34 +742,14 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v11, v1
-; GFX7-NEXT:    v_mov_b32_e32 v10, v0
-; GFX7-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT:    v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v8
-; GFX7-NEXT:    v_mov_b32_e32 v1, v9
-; GFX7-NEXT:    v_mov_b32_e32 v2, v10
-; GFX7-NEXT:    v_mov_b32_e32 v3, v11
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
@@ -997,25 +790,10 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1048,50 +826,19 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v5, v3
-; GFX10-NEXT:    v_mov_b32_e32 v4, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1148,29 +895,9 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT:    v_mov_b32_e32 v11, v5
-; GFX7-NEXT:    v_mov_b32_e32 v10, v4
-; GFX7-NEXT:    v_mov_b32_e32 v9, v3
-; GFX7-NEXT:    v_mov_b32_e32 v8, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v5, v9
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -1184,29 +911,11 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1237,55 +946,21 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT:    flat_atomic_max_f32 v0, v[0:1], v2 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1363,25 +1038,9 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
@@ -1395,28 +1054,11 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1446,53 +1088,23 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_atomic_max_f32 v[0:1], v2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1567,24 +1179,9 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    flat_atomic_fmax v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -1626,27 +1223,10 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
+; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v4
-; GFX940-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1680,54 +1260,19 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v7, v5
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
-; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1788,30 +1333,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v4, v[0:1]
-; GFX7-NEXT:    flat_load_dword v5, v[5:6]
-; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v7, v5
-; GFX7-NEXT:    v_mov_b32_e32 v6, v4
-; GFX7-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX7-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v4
-; GFX7-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
@@ -1852,25 +1376,10 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1903,50 +1412,20 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX10-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[2:3]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v5, v3
-; GFX10-NEXT:    v_mov_b32_e32 v4, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2003,28 +1482,9 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v4, v[0:1]
-; GFX7-NEXT:    flat_load_dword v5, v[5:6]
-; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX7-NEXT:    flat_atomic_fmax_x2 v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -2038,31 +1498,12 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v1, v1
-; GFX12-NEXT:    buffer_load_b32 v0, v2, s[0:3], null offen
-; GFX12-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v5, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v5, v5
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v0, v3
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -2095,65 +1536,27 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v1, v1
-; GFX11-NEXT:    buffer_load_b32 v0, v2, s[0:3], 0 offen
-; GFX11-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX11-NEXT:    v_max_f32_e32 v4, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], 0 offen glc
+; GFX11-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10-NEXT:    buffer_load_dword v0, v2, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:    v_max_f32_e32 v3, v1, v1
-; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v4, v0, v3
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -2255,28 +1658,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    v_mov_b32_e32 v1, v0
-; GFX7-NEXT:    buffer_load_dword v0, v2, s[4:7], 0 offen
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v1
-; GFX7-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v5
-; GFX7-NEXT:    v_max_f32_e32 v4, v0, v3
-; GFX7-NEXT:    v_mov_b32_e32 v0, v4
-; GFX7-NEXT:    v_mov_b32_e32 v1, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
@@ -2290,29 +1675,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_max_num_f32 v3, v0, v0
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
-; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v1, v1
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v0, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-NEXT:    v_mov_b32_e32 v1, v4
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -2344,61 +1712,27 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_max_f32 v3, v0, v0
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
-; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-NEXT:    v_mov_b32_e32 v1, v4
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_max_f32_e32 v3, v0, v0
-; GFX10-NEXT:    buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0
-; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -2497,27 +1831,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v0
-; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[4:7], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmax ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -2562,30 +1879,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v6, s6
-; GFX940-NEXT:    v_mov_b32_e32 v2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v3, v1
-; GFX940-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[0:3], 0 offen
-; GFX940-NEXT:    s_mov_b64 s[4:5], 0
-; GFX940-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
-; GFX940-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX940-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[8:9]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2623,38 +1921,16 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, s18
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v10, v1
-; GFX10-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX10-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v7
-; GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; GFX10-NEXT:    v_mov_b32_e32 v2, v9
-; GFX10-NEXT:    v_mov_b32_e32 v3, v10
-; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2664,29 +1940,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s16
 ; GFX90A-NEXT:    s_mov_b32 s7, s17
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s18
-; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
-; GFX90A-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen
-; GFX90A-NEXT:    s_mov_b64 s[8:9], 0
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX90A-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX90A-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2766,32 +2023,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s18
-; GFX7-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen
-; GFX7-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v10, v1
-; GFX7-NEXT:    v_mov_b32_e32 v9, v0
-; GFX7-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX7-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v7
-; GFX7-NEXT:    v_mov_b32_e32 v1, v8
-; GFX7-NEXT:    v_mov_b32_e32 v2, v9
-; GFX7-NEXT:    v_mov_b32_e32 v3, v10
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
@@ -2835,28 +2070,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v6, s6
-; GFX940-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[0:3], 0 offen
-; GFX940-NEXT:    s_mov_b64 s[4:5], 0
-; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX940-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
-; GFX940-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[2:3]
-; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2892,36 +2110,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, s18
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX10-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v10, v3
-; GFX10-NEXT:    v_mov_b32_e32 v9, v2
-; GFX10-NEXT:    v_mov_b32_e32 v8, v1
-; GFX10-NEXT:    v_mov_b32_e32 v7, v0
-; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v7
-; GFX10-NEXT:    v_mov_b32_e32 v3, v8
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2931,27 +2129,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s16
 ; GFX90A-NEXT:    s_mov_b32 s7, s17
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s18
-; GFX90A-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen
-; GFX90A-NEXT:    s_mov_b64 s[8:9], 0
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX90A-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v6, s[4:7], 0 offen glc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[2:3]
-; GFX90A-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -3027,30 +2208,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s18
-; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen
-; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v10, v3
-; GFX7-NEXT:    v_mov_b32_e32 v9, v2
-; GFX7-NEXT:    v_mov_b32_e32 v8, v1
-; GFX7-NEXT:    v_mov_b32_e32 v7, v0
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX7-NEXT:    v_mov_b32_e32 v2, v7
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v8
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmax ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 90110e6e0c09e..83be67a9138f6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -318,29 +318,11 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -371,55 +353,21 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
 ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -500,27 +448,10 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
@@ -534,28 +465,11 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -585,53 +499,21 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -709,26 +591,9 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_mov_b32_e32 v6, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -770,27 +635,10 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v4
-; GFX940-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -824,54 +672,19 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v7, v5
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
-; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -929,34 +742,14 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v11, v1
-; GFX7-NEXT:    v_mov_b32_e32 v10, v0
-; GFX7-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT:    v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v8
-; GFX7-NEXT:    v_mov_b32_e32 v1, v9
-; GFX7-NEXT:    v_mov_b32_e32 v2, v10
-; GFX7-NEXT:    v_mov_b32_e32 v3, v11
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
@@ -997,25 +790,10 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1048,50 +826,19 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v5, v3
-; GFX10-NEXT:    v_mov_b32_e32 v4, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1148,29 +895,9 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT:    v_mov_b32_e32 v11, v5
-; GFX7-NEXT:    v_mov_b32_e32 v10, v4
-; GFX7-NEXT:    v_mov_b32_e32 v9, v3
-; GFX7-NEXT:    v_mov_b32_e32 v8, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v8
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v5, v9
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -1184,29 +911,11 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1237,55 +946,21 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1363,25 +1038,9 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v4, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v4
-; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
@@ -1395,28 +1054,11 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1446,53 +1088,23 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1567,24 +1179,9 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -1626,27 +1223,10 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX940-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX940-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
+; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v4
-; GFX940-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1680,54 +1260,19 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v7, v5
-; GFX10-NEXT:    v_mov_b32_e32 v6, v4
-; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX10-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX90A-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1788,30 +1333,9 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v4, v[0:1]
-; GFX7-NEXT:    flat_load_dword v5, v[5:6]
-; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v7, v5
-; GFX7-NEXT:    v_mov_b32_e32 v6, v4
-; GFX7-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX7-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT:    flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v4
-; GFX7-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
@@ -1852,25 +1376,10 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX940-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0
+; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1903,50 +1412,20 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX10-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX10-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[2:3]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v5, v3
-; GFX10-NEXT:    v_mov_b32_e32 v4, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX90A-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2003,28 +1482,9 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v4, v[0:1]
-; GFX7-NEXT:    flat_load_dword v5, v[5:6]
-; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX7-NEXT:    flat_atomic_fmin_x2 v[0:1], v[2:3]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v4, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -2038,31 +1498,12 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v1, v1
-; GFX12-NEXT:    buffer_load_b32 v0, v2, s[0:3], null offen
-; GFX12-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v5, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v5, v5
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v4, v0, v3
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -2095,65 +1536,27 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v1, v1
-; GFX11-NEXT:    buffer_load_b32 v0, v2, s[0:3], 0 offen
-; GFX11-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX11-NEXT:    v_min_f32_e32 v4, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], 0 offen glc
+; GFX11-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10-NEXT:    buffer_load_dword v0, v2, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:    v_max_f32_e32 v3, v1, v1
-; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX10-NEXT:    v_min_f32_e32 v4, v0, v3
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -2255,28 +1658,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    v_mov_b32_e32 v1, v0
-; GFX7-NEXT:    buffer_load_dword v0, v2, s[4:7], 0 offen
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v1
-; GFX7-NEXT:  .LBB12_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v5
-; GFX7-NEXT:    v_min_f32_e32 v4, v0, v3
-; GFX7-NEXT:    v_mov_b32_e32 v0, v4
-; GFX7-NEXT:    v_mov_b32_e32 v1, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
@@ -2290,29 +1675,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_max_num_f32 v3, v0, v0
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
-; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v1, v1
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_min_num_f32_e32 v0, v0, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-NEXT:    v_mov_b32_e32 v1, v4
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -2344,61 +1712,27 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_max_f32 v3, v0, v0
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    buffer_load_b32 v1, v2, s[0:3], 0 offen
-; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX11-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-NEXT:    v_mov_b32_e32 v1, v4
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_max_f32_e32 v3, v0, v0
-; GFX10-NEXT:    buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v0, v1, v1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX10-NEXT:    v_mov_b32_e32 v5, v1
-; GFX10-NEXT:    v_mov_b32_e32 v4, v0
-; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, v4
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -2497,27 +1831,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v0
-; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[4:7], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v1, v4
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmin ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
@@ -2562,30 +1879,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v6, s6
-; GFX940-NEXT:    v_mov_b32_e32 v2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v3, v1
-; GFX940-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[0:3], 0 offen
-; GFX940-NEXT:    s_mov_b64 s[4:5], 0
-; GFX940-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
-; GFX940-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX940-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[8:9]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2623,38 +1921,16 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, s18
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v3, v1
-; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v10, v1
-; GFX10-NEXT:    v_mov_b32_e32 v9, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX10-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v0, v7
-; GFX10-NEXT:    v_mov_b32_e32 v1, v8
-; GFX10-NEXT:    v_mov_b32_e32 v2, v9
-; GFX10-NEXT:    v_mov_b32_e32 v3, v10
-; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2664,29 +1940,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s16
 ; GFX90A-NEXT:    s_mov_b32 s7, s17
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s18
-; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
-; GFX90A-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen
-; GFX90A-NEXT:    s_mov_b64 s[8:9], 0
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX90A-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; GFX90A-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2766,32 +2023,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s18
-; GFX7-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v6, s[4:7], 0 offen
-; GFX7-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v10, v1
-; GFX7-NEXT:    v_mov_b32_e32 v9, v0
-; GFX7-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX7-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v7
-; GFX7-NEXT:    v_mov_b32_e32 v1, v8
-; GFX7-NEXT:    v_mov_b32_e32 v2, v9
-; GFX7-NEXT:    v_mov_b32_e32 v3, v10
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
@@ -2835,28 +2070,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v6, s6
-; GFX940-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[0:3], 0 offen
-; GFX940-NEXT:    s_mov_b64 s[4:5], 0
-; GFX940-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX940-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
-; GFX940-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v6, s[0:3], 0 offen sc0
+; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[2:3]
-; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
-; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2892,36 +2110,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, s18
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX10-NEXT:    s_mov_b32 s4, s6
 ; GFX10-NEXT:    s_mov_b32 s5, s7
 ; GFX10-NEXT:    s_mov_b32 s6, s16
 ; GFX10-NEXT:    s_mov_b32 s7, s17
-; GFX10-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX10-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX10-NEXT:    v_mov_b32_e32 v10, v3
-; GFX10-NEXT:    v_mov_b32_e32 v9, v2
-; GFX10-NEXT:    v_mov_b32_e32 v8, v1
-; GFX10-NEXT:    v_mov_b32_e32 v7, v0
-; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v7
-; GFX10-NEXT:    v_mov_b32_e32 v3, v8
-; GFX10-NEXT:    s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2931,27 +2129,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 ; GFX90A-NEXT:    s_mov_b32 s5, s7
 ; GFX90A-NEXT:    s_mov_b32 s6, s16
 ; GFX90A-NEXT:    s_mov_b32 s7, s17
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s18
-; GFX90A-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen
-; GFX90A-NEXT:    s_mov_b64 s[8:9], 0
-; GFX90A-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX90A-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v6, s[4:7], 0 offen glc
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[2:3]
-; GFX90A-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -3027,30 +2208,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 ; GFX7-NEXT:    s_mov_b32 s5, s7
 ; GFX7-NEXT:    s_mov_b32 s6, s16
 ; GFX7-NEXT:    s_mov_b32 s7, s17
-; GFX7-NEXT:    v_mov_b32_e32 v6, s18
-; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v6, s[4:7], 0 offen
-; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v10, v3
-; GFX7-NEXT:    v_mov_b32_e32 v9, v2
-; GFX7-NEXT:    v_mov_b32_e32 v8, v1
-; GFX7-NEXT:    v_mov_b32_e32 v7, v0
-; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX7-NEXT:    v_mov_b32_e32 v2, v7
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v8
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %unused = atomicrmw fmin ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index ec0408236975d..172ce4c065e13 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -13,8 +13,8 @@
 ; float
 ; --------------------------------------------------------------------
 
-define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -29,7 +29,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -58,7 +58,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s6
@@ -69,7 +69,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -84,7 +84,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -116,7 +116,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -149,7 +149,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -182,7 +182,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -195,7 +195,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -209,12 +209,12 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -229,7 +229,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s6
@@ -257,7 +257,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s6
@@ -268,7 +268,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -283,7 +283,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
@@ -314,7 +314,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b32 s11, s17
@@ -346,7 +346,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s11, s17
@@ -378,7 +378,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -391,7 +391,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -405,12 +405,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -444,7 +444,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
@@ -505,7 +505,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s1, exec_lo
@@ -535,7 +535,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s5, exec_lo
@@ -564,7 +564,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
@@ -623,7 +623,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
@@ -683,7 +683,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
@@ -743,7 +743,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
@@ -769,7 +769,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
@@ -796,90 +796,79 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-; --------------------------------------------------------------------
-; double
-; --------------------------------------------------------------------
-
-define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s6
-; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mov_b32_e32 v6, s4
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
-; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX940-NEXT:    s_addk_i32 s6, 0x400
+; GFX940-NEXT:    s_mov_b64 s[4:5], 0
+; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, s6
+; GFX940-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v0
+; GFX940-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX940-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
+; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, s6
-; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mov_b32_e32 v6, s4
-; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x400
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
+; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX11-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX11-NEXT:    v_max_f32_e32 v4, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
-; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -888,64 +877,99 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7)
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX10-NEXT:    s_mov_b32 s11, s17
 ; GFX10-NEXT:    s_mov_b32 s10, s16
 ; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_add_i32 s4, s18, 0x400
+; GFX10-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX10-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX10-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX10-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s10, s16
 ; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s18
+; GFX90A-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
+; GFX90A-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
+; GFX90A-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX90A-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX908-NEXT:    s_mov_b32 s11, s17
 ; GFX908-NEXT:    s_mov_b32 s10, s16
 ; GFX908-NEXT:    s_mov_b32 s9, s7
 ; GFX908-NEXT:    s_mov_b32 s8, s6
-; GFX908-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v0, s18
-; GFX908-NEXT:    v_mov_b32_e32 v3, v1
-; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
-; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v10, v1
-; GFX908-NEXT:    v_mov_b32_e32 v9, v0
-; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX908-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v7
-; GFX908-NEXT:    v_mov_b32_e32 v1, v8
-; GFX908-NEXT:    v_mov_b32_e32 v2, v9
-; GFX908-NEXT:    v_mov_b32_e32 v3, v10
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB3_1
@@ -953,36 +977,32 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX8-NEXT:    s_mov_b32 s11, s17
 ; GFX8-NEXT:    s_mov_b32 s10, s16
 ; GFX8-NEXT:    s_mov_b32 s9, s7
 ; GFX8-NEXT:    s_mov_b32 s8, s6
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s18
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
-; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v10, v1
-; GFX8-NEXT:    v_mov_b32_e32 v9, v0
-; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX8-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v7
-; GFX8-NEXT:    v_mov_b32_e32 v1, v8
-; GFX8-NEXT:    v_mov_b32_e32 v2, v9
-; GFX8-NEXT:    v_mov_b32_e32 v3, v10
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB3_1
@@ -990,669 +1010,1659 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX7-NEXT:    s_mov_b32 s11, s17
 ; GFX7-NEXT:    s_mov_b32 s10, s16
 ; GFX7-NEXT:    s_mov_b32 s9, s7
 ; GFX7-NEXT:    s_mov_b32 s8, s6
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s18
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX7-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, s6
+; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v5, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX7-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v1, v5
+; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX6-NEXT:    s_mov_b32 s11, s17
 ; GFX6-NEXT:    s_mov_b32 s10, s16
 ; GFX6-NEXT:    s_mov_b32 s9, s7
 ; GFX6-NEXT:    s_mov_b32 s8, s6
-; GFX6-NEXT:    v_mov_b32_e32 v2, s18
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX6-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX6-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX6-NEXT:    v_mov_b32_e32 v3, s6
+; GFX6-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX6-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v0, v4
+; GFX6-NEXT:    v_mov_b32_e32 v1, v5
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
-  ret double %result
+  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret float %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v2, s6
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
-; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_mov_b32_e32 v6, s4
-; GFX12-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, s6
-; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v6, s4
-; GFX11-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX940-NEXT:    s_addk_i32 s6, 0x400
+; GFX940-NEXT:    s_mov_b64 s[4:5], 0
+; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, s6
+; GFX940-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v0
+; GFX940-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX940-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s11, s17
 ; GFX10-NEXT:    s_mov_b32 s10, s16
 ; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_mov_b32 s8, s6
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s10, s16
 ; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s18
+; GFX90A-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
+; GFX90A-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
+; GFX90A-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX90A-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX908-NEXT:    s_mov_b32 s11, s17
 ; GFX908-NEXT:    s_mov_b32 s10, s16
 ; GFX908-NEXT:    s_mov_b32 s9, s7
 ; GFX908-NEXT:    s_mov_b32 s8, s6
-; GFX908-NEXT:    v_mov_b32_e32 v2, s18
-; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
-; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v10, v3
-; GFX908-NEXT:    v_mov_b32_e32 v9, v2
-; GFX908-NEXT:    v_mov_b32_e32 v8, v1
-; GFX908-NEXT:    v_mov_b32_e32 v7, v0
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX908-NEXT:    v_mov_b32_e32 v2, v7
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX8-NEXT:    s_mov_b32 s11, s17
 ; GFX8-NEXT:    s_mov_b32 s10, s16
 ; GFX8-NEXT:    s_mov_b32 s9, s7
 ; GFX8-NEXT:    s_mov_b32 s8, s6
-; GFX8-NEXT:    v_mov_b32_e32 v2, s18
-; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
-; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v10, v3
-; GFX8-NEXT:    v_mov_b32_e32 v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v8, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, v0
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX8-NEXT:    v_mov_b32_e32 v2, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
 ; GFX7-NEXT:    s_mov_b32 s10, s16
 ; GFX7-NEXT:    s_mov_b32 s9, s7
 ; GFX7-NEXT:    s_mov_b32 s8, s6
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
 ; GFX6-NEXT:    s_mov_b32 s10, s16
 ; GFX6-NEXT:    s_mov_b32 s9, s7
 ; GFX6-NEXT:    s_mov_b32 s8, s6
-; GFX6-NEXT:    v_mov_b32_e32 v2, s18
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX6-NEXT:    v_mov_b32_e32 v1, s18
+; GFX6-NEXT:    buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
-  ret void
+  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret float %result
 }
 
-define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; --------------------------------------------------------------------
+; double
+; --------------------------------------------------------------------
+
+define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
-; GFX12-NEXT:    s_mov_b32 s1, exec_lo
-; GFX12-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX12-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
-; GFX12-NEXT:    ; implicit-def: $vgpr4
-; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX12-NEXT:  ; %bb.2:
-; GFX12-NEXT:    s_mov_b32 exec_lo, s1
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
-; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB5_4 Depth 2
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
-; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[11:12], v[0:1], v[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX12-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
-; GFX12-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX12-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
-; GFX12-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v7, v6
-; GFX940-NEXT:    v_mov_b32_e32 v6, v5
-; GFX940-NEXT:    s_mov_b64 s[2:3], exec
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
-; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX940-NEXT:    ; implicit-def: $vgpr4
-; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX940-NEXT:  ; %bb.2:
-; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, v6
-; GFX940-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
-; GFX11-NEXT:    s_mov_b32 s1, 0
-; GFX11-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX11-NEXT:  ; %bb.2:
-; GFX11-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
-; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB5_4 Depth 2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
-; GFX11-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[11:12], v[0:1], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX11-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
-; GFX11-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
-; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
-; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s5, exec_lo
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
-; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
-; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
-; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX10-NEXT:    ; implicit-def: $vgpr4
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX10-NEXT:  ; %bb.2:
-; GFX10-NEXT:    s_mov_b32 exec_lo, s5
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v5
-; GFX10-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v6
-; GFX90A-NEXT:    v_mov_b32_e32 v6, v5
-; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
-; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX90A-NEXT:    ; implicit-def: $vgpr4
-; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX90A-NEXT:  ; %bb.2:
-; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v6
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v8, v3
-; GFX908-NEXT:    v_mov_b32_e32 v7, v2
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v10, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v9, v0
-; GFX908-NEXT:    v_add_u32_e32 v15, 0x800, v4
-; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX908-NEXT:    ; implicit-def: $vgpr4
-; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX908-NEXT:  ; %bb.2:
-; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX908-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
-; GFX908-NEXT:    s_mov_b64 s[6:7], 0
-; GFX908-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB5_4 Depth 2
+; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v7
+; GFX908-NEXT:    v_mov_b32_e32 v1, v8
+; GFX908-NEXT:    v_mov_b32_e32 v2, v9
+; GFX908-NEXT:    v_mov_b32_e32 v3, v10
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
-; GFX908-NEXT:    s_mov_b64 s[12:13], exec
-; GFX908-NEXT:    v_max_f64 v[11:12], v[0:1], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v11
-; GFX908-NEXT:    v_mov_b32_e32 v1, v12
-; GFX908-NEXT:    v_mov_b32_e32 v2, v13
-; GFX908-NEXT:    v_mov_b32_e32 v3, v14
-; GFX908-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NEXT:    v_mov_b32_e32 v3, v10
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
+}
+
+define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s6
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, s18
+; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v10, v3
+; GFX908-NEXT:    v_mov_b32_e32 v9, v2
+; GFX908-NEXT:    v_mov_b32_e32 v8, v1
+; GFX908-NEXT:    v_mov_b32_e32 v7, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v7
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v8
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v10, v3
+; GFX8-NEXT:    v_mov_b32_e32 v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v7
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v8
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %unused = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
+define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
+; GFX12-NEXT:    s_mov_b32 s1, exec_lo
+; GFX12-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX12-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
+; GFX12-NEXT:    ; implicit-def: $vgpr4
+; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-NEXT:  ; %bb.2:
+; GFX12-NEXT:    s_mov_b32 exec_lo, s1
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Loop Header: Depth=1
+; GFX12-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
+; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[11:12], v[0:1], v[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
+; GFX12-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX12-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX12-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
+; GFX12-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v6
+; GFX940-NEXT:    v_mov_b32_e32 v6, v5
+; GFX940-NEXT:    s_mov_b64 s[2:3], exec
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
+; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX940-NEXT:    ; implicit-def: $vgpr4
+; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX940-NEXT:  ; %bb.2:
+; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v6
+; GFX940-NEXT:    v_mov_b32_e32 v1, v7
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX11-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
+; GFX11-NEXT:    ; implicit-def: $vgpr4
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Loop Header: Depth=1
+; GFX11-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[11:12], v[0:1], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
+; GFX11-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX11-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
+; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s5, exec_lo
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
+; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX10-NEXT:    ; implicit-def: $vgpr4
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX10-NEXT:  ; %bb.2:
+; GFX10-NEXT:    s_mov_b32 exec_lo, s5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v5
+; GFX10-NEXT:    v_mov_b32_e32 v1, v6
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v6
+; GFX90A-NEXT:    v_mov_b32_e32 v6, v5
+; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
+; GFX90A-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr4
+; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX90A-NEXT:  ; %bb.2:
+; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v6
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v7
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v8, v3
+; GFX908-NEXT:    v_mov_b32_e32 v7, v2
+; GFX908-NEXT:    v_mov_b32_e32 v10, v1
+; GFX908-NEXT:    v_mov_b32_e32 v9, v0
+; GFX908-NEXT:    v_add_u32_e32 v15, 0x800, v4
+; GFX908-NEXT:    s_mov_b64 s[6:7], exec
+; GFX908-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_nop 0
+; GFX908-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    ; implicit-def: $vgpr4
+; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX908-NEXT:  ; %bb.2:
+; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Loop Header: Depth=1
+; GFX908-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
+; GFX908-NEXT:    s_mov_b64 s[12:13], exec
+; GFX908-NEXT:    v_max_f64 v[11:12], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v11
+; GFX908-NEXT:    v_mov_b32_e32 v1, v12
+; GFX908-NEXT:    v_mov_b32_e32 v2, v13
+; GFX908-NEXT:    v_mov_b32_e32 v3, v14
+; GFX908-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
+; GFX908-NEXT:    v_mov_b32_e32 v14, v1
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v13, v0
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v8, v3
+; GFX8-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v0
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0x800, v4
+; GFX8-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    ; implicit-def: $vgpr4
+; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX8-NEXT:  ; %bb.2:
+; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Loop Header: Depth=1
+; GFX8-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
+; GFX8-NEXT:    s_mov_b64 s[12:13], exec
+; GFX8-NEXT:    v_max_f64 v[11:12], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v11
+; GFX8-NEXT:    v_mov_b32_e32 v1, v12
+; GFX8-NEXT:    v_mov_b32_e32 v2, v13
+; GFX8-NEXT:    v_mov_b32_e32 v3, v14
+; GFX8-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
+; GFX8-NEXT:    v_mov_b32_e32 v14, v1
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v13, v0
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[6:7], exec
+; GFX7-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
+; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX7-NEXT:    ; implicit-def: $vgpr4
+; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7-NEXT:  ; %bb.2:
+; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, v5
+; GFX7-NEXT:    v_mov_b32_e32 v1, v6
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
+; GFX6-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX6-NEXT:    ; implicit-def: $vgpr4
+; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX6-NEXT:  ; %bb.2:
+; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, v5
+; GFX6-NEXT:    v_mov_b32_e32 v1, v6
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
+}
+
+define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX10-NEXT:    s_add_i32 s4, s18, 0x800
+; GFX10-NEXT:    v_mov_b32_e32 v6, s4
+; GFX10-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX10-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v7
+; GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; GFX10-NEXT:    v_mov_b32_e32 v2, v9
+; GFX10-NEXT:    v_mov_b32_e32 v3, v10
+; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s18
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX90A-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT:    v_mov_b32_e32 v6, s6
+; GFX90A-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX90A-NEXT:    v_max_f64 v[8:9], v[0:1], v[4:5]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v10, v1
+; GFX908-NEXT:    v_mov_b32_e32 v9, v0
+; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v7
+; GFX908-NEXT:    v_mov_b32_e32 v1, v8
+; GFX908-NEXT:    v_mov_b32_e32 v2, v9
+; GFX908-NEXT:    v_mov_b32_e32 v3, v10
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NEXT:    v_mov_b32_e32 v3, v10
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s18
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX7-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX7-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_mov_b32_e32 v6, s6
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v10, v1
+; GFX7-NEXT:    v_mov_b32_e32 v9, v0
+; GFX7-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX7-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v7
+; GFX7-NEXT:    v_mov_b32_e32 v1, v8
+; GFX7-NEXT:    v_mov_b32_e32 v2, v9
+; GFX7-NEXT:    v_mov_b32_e32 v3, v10
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX6-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX6-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    v_mov_b32_e32 v6, s6
+; GFX6-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v10, v1
+; GFX6-NEXT:    v_mov_b32_e32 v9, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX6-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v7
+; GFX6-NEXT:    v_mov_b32_e32 v1, v8
+; GFX6-NEXT:    v_mov_b32_e32 v2, v9
+; GFX6-NEXT:    v_mov_b32_e32 v3, v10
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret double %result
+}
+
+define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
-; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX908-NEXT:    v_mov_b32_e32 v10, v1
+; GFX908-NEXT:    v_mov_b32_e32 v9, v0
+; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v7
+; GFX908-NEXT:    v_mov_b32_e32 v1, v8
+; GFX908-NEXT:    v_mov_b32_e32 v2, v9
+; GFX908-NEXT:    v_mov_b32_e32 v3, v10
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
-; GFX908-NEXT:    v_mov_b32_e32 v14, v1
-; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT:    v_mov_b32_e32 v13, v0
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v8, v3
-; GFX8-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v10, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v9, v0
-; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0x800, v4
-; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX8-NEXT:    ; implicit-def: $vgpr4
-; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX8-NEXT:  ; %bb.2:
-; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX8-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
-; GFX8-NEXT:    s_mov_b64 s[6:7], 0
-; GFX8-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB5_4 Depth 2
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
-; GFX8-NEXT:    s_mov_b64 s[12:13], exec
-; GFX8-NEXT:    v_max_f64 v[11:12], v[0:1], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v11
-; GFX8-NEXT:    v_mov_b32_e32 v1, v12
-; GFX8-NEXT:    v_mov_b32_e32 v2, v13
-; GFX8-NEXT:    v_mov_b32_e32 v3, v14
-; GFX8-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
-; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT:    v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NEXT:    v_mov_b32_e32 v3, v10
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
-; GFX8-NEXT:    v_mov_b32_e32 v14, v1
-; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT:    v_mov_b32_e32 v13, v0
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
-; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX7-NEXT:    ; implicit-def: $vgpr4
-; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX7-NEXT:  ; %bb.2:
-; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, v5
-; GFX7-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
-; GFX6-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX6-NEXT:    ; implicit-def: $vgpr4
-; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX6-NEXT:  ; %bb.2:
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, v5
-; GFX6-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %result
 }
 
@@ -1660,8 +2670,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a
 ; half
 ; --------------------------------------------------------------------
 
-define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1680,7 +2690,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX12-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -1703,13 +2713,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -1722,7 +2732,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX940-NEXT:    s_not_b32 s7, s4
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX940-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1739,13 +2749,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -1761,7 +2771,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -1784,13 +2794,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -1807,7 +2817,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -1826,13 +2836,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -1849,7 +2859,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX90A-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1865,13 +2875,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -1888,7 +2898,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1905,13 +2915,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -1928,7 +2938,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1946,13 +2956,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -1970,7 +2980,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1989,14 +2999,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -2014,7 +3024,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2034,7 +3044,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
@@ -2042,12 +3052,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2066,7 +3076,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -2089,12 +3099,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -2107,7 +3117,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_not_b32 s7, s4
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX940-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2124,12 +3134,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -2145,7 +3155,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -2168,12 +3178,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -2190,7 +3200,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -2209,12 +3219,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -2231,7 +3241,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2247,12 +3257,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -2269,7 +3279,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX908-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2286,12 +3296,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -2308,7 +3318,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX8-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2326,12 +3336,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -2349,7 +3359,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2368,12 +3378,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -2391,7 +3401,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2411,18 +3421,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2438,7 +3448,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX12-NEXT:    v_not_b32_e32 v9, v6
-; GFX12-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -2451,14 +3461,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_max_num_f16_e32 v10, v5, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-NEXT:    s_mov_b32 s2, exec_lo
@@ -2474,7 +3484,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX12-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -2489,8 +3499,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -2499,13 +3509,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -2516,7 +3526,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v8, s0
 ; GFX940-NEXT:    v_not_b32_e32 v10, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -2528,14 +3538,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX940-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX940-NEXT:    v_max_f16_e32 v11, v5, v5
-; GFX940-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
 ; GFX940-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2545,7 +3555,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX940-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -2559,8 +3569,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -2568,13 +3578,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -2587,7 +3597,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX11-NEXT:    v_not_b32_e32 v9, v6
-; GFX11-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -2600,14 +3610,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_max_f16_e32 v10, v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
@@ -2622,7 +3632,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX11-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -2637,8 +3647,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -2648,13 +3658,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -2665,7 +3675,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v9, v6
-; GFX10-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2677,13 +3687,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_max_f16_e32 v10, v5, v5
-; GFX10-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
@@ -2694,7 +3704,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX10-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2708,8 +3718,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -2719,13 +3729,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -2736,7 +3746,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v8, s4
 ; GFX90A-NEXT:    v_not_b32_e32 v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2748,14 +3758,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v11, v5, v5
-; GFX90A-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
 ; GFX90A-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2764,7 +3774,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    v_and_or_b32 v6, v7, v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX90A-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2777,8 +3787,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -2786,13 +3796,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -2803,7 +3813,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX908-NEXT:    v_not_b32_e32 v9, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2815,14 +3825,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    s_nop 0
 ; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX908-NEXT:    v_max_f16_e32 v10, v5, v5
-; GFX908-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX908-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2832,7 +3842,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX908-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2845,8 +3855,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -2854,13 +3864,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x200, v4
@@ -2871,7 +3881,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX8-NEXT:    v_not_b32_e32 v9, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2883,14 +3893,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_max_f16_e32 v10, v5, v5
-; GFX8-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2901,7 +3911,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX8-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2914,8 +3924,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -2923,13 +3933,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -2939,7 +3949,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX7-NEXT:    v_not_b32_e32 v9, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2950,15 +3960,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v4
-; GFX7-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -2970,7 +3980,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    v_or_b32_e32 v5, v5, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v6
-; GFX7-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX7-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2983,8 +3993,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -2992,14 +4002,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -3009,7 +4019,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_not_b32_e32 v9, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -3020,15 +4030,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v4
-; GFX6-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -3040,7 +4050,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v5, v6
-; GFX6-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX6-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -3053,8 +4063,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -3062,7 +4072,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
@@ -3070,7 +4080,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
@@ -3078,8 +4088,8 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
 ; bfloat
 ; --------------------------------------------------------------------
 
-define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3098,7 +4108,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3127,13 +4137,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -3147,7 +4157,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX940-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3169,13 +4179,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -3191,7 +4201,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3221,13 +4231,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -3244,7 +4254,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3267,13 +4277,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -3291,7 +4301,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3311,13 +4321,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -3335,7 +4345,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3356,13 +4366,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -3379,7 +4389,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -3403,13 +4413,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -3427,7 +4437,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3447,14 +4457,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -3472,7 +4482,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX6-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3493,7 +4503,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
@@ -3501,12 +4511,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3525,7 +4535,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3554,12 +4564,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -3573,7 +4583,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3595,12 +4605,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -3616,7 +4626,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3646,12 +4656,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -3668,7 +4678,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3691,12 +4701,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -3714,7 +4724,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3734,12 +4744,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -3757,7 +4767,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3778,12 +4788,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -3800,7 +4810,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -3824,12 +4834,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -3847,7 +4857,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3867,12 +4877,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -3890,7 +4900,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3911,18 +4921,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3938,7 +4948,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX12-NEXT:    v_not_b32_e32 v9, v6
-; GFX12-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -3951,14 +4961,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-NEXT:    s_mov_b32 s2, exec_lo
@@ -3980,7 +4990,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX12-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -3995,8 +5005,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -4005,13 +5015,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -4022,7 +5032,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v8, s0
 ; GFX940-NEXT:    v_not_b32_e32 v10, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -4034,15 +5044,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX940-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX940-NEXT:    s_movk_i32 s10, 0x7fff
-; GFX940-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX940-NEXT:    s_mov_b64 s[8:9], exec
@@ -4057,7 +5067,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX940-NEXT:    v_and_or_b32 v6, v7, v10, v4
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX940-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -4071,8 +5081,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -4080,13 +5090,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -4099,7 +5109,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX11-NEXT:    v_not_b32_e32 v9, v6
-; GFX11-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -4112,15 +5122,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
@@ -4142,7 +5152,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX11-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -4157,8 +5167,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -4168,14 +5178,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -4186,7 +5196,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v9, v6
-; GFX10-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4198,13 +5208,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
@@ -4219,7 +5229,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX10-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4233,8 +5243,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -4244,13 +5254,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -4261,7 +5271,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v8, s4
 ; GFX90A-NEXT:    v_not_b32_e32 v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4273,15 +5283,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX90A-NEXT:    s_movk_i32 s14, 0x7fff
-; GFX90A-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v4, v11
@@ -4294,7 +5304,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    v_and_or_b32 v6, v7, v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX90A-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4307,8 +5317,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -4316,13 +5326,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -4333,7 +5343,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX908-NEXT:    v_not_b32_e32 v9, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4345,15 +5355,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    s_nop 0
 ; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX908-NEXT:    s_movk_i32 s14, 0x7fff
-; GFX908-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX908-NEXT:    v_max_f32_e32 v4, v4, v10
@@ -4367,7 +5377,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX908-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4380,8 +5390,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4389,13 +5399,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x200, v4
@@ -4406,7 +5416,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX8-NEXT:    v_not_b32_e32 v9, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4418,14 +5428,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX8-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_max_f32_e32 v4, v4, v10
@@ -4441,7 +5451,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX8-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4454,8 +5464,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4463,13 +5473,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -4479,7 +5489,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX7-NEXT:    v_not_b32_e32 v9, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4490,15 +5500,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX7-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -4511,7 +5521,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v6
-; GFX7-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX7-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4524,8 +5534,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4533,14 +5543,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -4550,7 +5560,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_not_b32_e32 v9, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4561,15 +5571,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX6-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -4582,7 +5592,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX6-NEXT:    v_mov_b32_e32 v5, v6
-; GFX6-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX6-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4595,8 +5605,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4604,7 +5614,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
@@ -4612,7 +5622,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
@@ -4620,8 +5630,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
 ; <2 x half>
 ; --------------------------------------------------------------------
 
-define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4635,7 +5645,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v1, v1
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
 ; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v0
@@ -4652,12 +5662,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -4667,7 +5677,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v3, s6
-; GFX940-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v0
@@ -4682,12 +5692,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
@@ -4697,7 +5707,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX11-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v0
@@ -4715,12 +5725,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -4734,7 +5744,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX10-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0
@@ -4750,12 +5760,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -4769,7 +5779,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
-; GFX90A-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
@@ -4782,12 +5792,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -4801,7 +5811,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
-; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v0
@@ -4815,12 +5825,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -4835,7 +5845,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v3, v1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -4852,12 +5862,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -4877,7 +5887,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s6
-; GFX7-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -4902,12 +5912,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -4927,7 +5937,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s6
-; GFX6-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -4953,18 +5963,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4977,7 +5987,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX12-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
-; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v0, v1, v1
@@ -4994,12 +6004,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s6
@@ -5008,7 +6018,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v3, s6
-; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5023,12 +6033,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s6
@@ -5037,7 +6047,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
-; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5054,12 +6064,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -5072,7 +6082,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX10-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5088,12 +6098,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
@@ -5106,7 +6116,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
-; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5119,12 +6129,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b32 s11, s17
@@ -5137,7 +6147,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
-; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5151,12 +6161,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s11, s17
@@ -5170,7 +6180,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -5187,12 +6197,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -5212,7 +6222,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -5237,12 +6247,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -5262,7 +6272,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -5288,18 +6298,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5308,7 +6318,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX12-NEXT:    s_mov_b32 s1, exec_lo
-; GFX12-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -5322,14 +6332,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
 ; GFX12-NEXT:    ; implicit-def: $vgpr4
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_pk_max_num_f16 v8, v5, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v6, v6
 ; GFX12-NEXT:    s_mov_b32 s2, exec_lo
@@ -5339,7 +6349,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX12-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -5354,8 +6364,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -5364,18 +6374,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -5388,21 +6398,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX940-NEXT:    buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
 ; GFX940-NEXT:    ; implicit-def: $vgpr4
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX940-NEXT:    v_pk_max_f16 v9, v5, v5
-; GFX940-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v4, v7, v7
 ; GFX940-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX940-NEXT:    v_pk_max_f16 v6, v4, v9
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX940-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -5416,8 +6426,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -5425,19 +6435,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -5451,14 +6461,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
 ; GFX11-NEXT:    ; implicit-def: $vgpr4
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_pk_max_f16 v8, v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v4, v6, v6
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
@@ -5467,7 +6477,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    v_pk_max_f16 v5, v4, v8
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX11-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -5482,8 +6492,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -5493,19 +6503,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
-; GFX10-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5518,13 +6528,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    ; implicit-def: $vgpr4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_pk_max_f16 v8, v5, v5
-; GFX10-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v4, v6, v6
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
@@ -5532,7 +6542,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    v_pk_max_f16 v5, v4, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX10-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5546,8 +6556,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -5557,18 +6567,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5581,20 +6591,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v9, v5, v5
-; GFX90A-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v7, v7
 ; GFX90A-NEXT:    v_pk_max_f16 v6, v4, v9
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX90A-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5607,8 +6617,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -5616,18 +6626,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5640,21 +6650,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX908-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX908-NEXT:    ; implicit-def: $vgpr4
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX908-NEXT:    v_pk_max_f16 v8, v5, v5
-; GFX908-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v4, v6, v6
 ; GFX908-NEXT:    v_pk_max_f16 v5, v4, v8
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX908-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5667,8 +6677,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -5676,18 +6686,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5700,15 +6710,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX8-NEXT:    ; implicit-def: $vgpr4
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v9, v5, v5
-; GFX8-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v6, v6
@@ -5718,7 +6728,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX8-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5731,8 +6741,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -5740,18 +6750,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5763,7 +6773,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX7-NEXT:    ; implicit-def: $vgpr4
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -5775,9 +6785,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v8
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
-; GFX7-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX7-NEXT:    s_mov_b64 s[12:13], exec
@@ -5793,7 +6803,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v7, v5
-; GFX7-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX7-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5806,8 +6816,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
@@ -5817,19 +6827,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5841,7 +6851,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX6-NEXT:    ; implicit-def: $vgpr4
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -5853,9 +6863,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v11, v8
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
-; GFX6-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX6-NEXT:    s_mov_b64 s[12:13], exec
@@ -5872,7 +6882,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    v_or_b32_e32 v5, v7, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v7, v5
-; GFX6-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX6-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5885,8 +6895,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
@@ -5896,7 +6906,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v4
@@ -5904,7 +6914,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
@@ -5912,8 +6922,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
 ; <2 x bfloat>
 ; --------------------------------------------------------------------
 
-define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5928,7 +6938,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
 ; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v0
@@ -5961,12 +6971,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -5979,7 +6989,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX940-NEXT:    v_mov_b32_e32 v4, s4
-; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v0
@@ -6006,12 +7016,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
 ; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
@@ -6024,7 +7034,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v0
@@ -6058,13 +7068,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -6079,7 +7089,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v0
@@ -6108,12 +7118,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -6130,7 +7140,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX90A-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v0
@@ -6156,12 +7166,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -6178,7 +7188,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX908-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
-; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v0
@@ -6205,12 +7215,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -6225,7 +7235,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -6255,12 +7265,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -6279,7 +7289,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s6
-; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -6301,12 +7311,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -6325,7 +7335,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s6
-; GFX6-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -6348,18 +7358,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6372,7 +7382,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX12-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
@@ -6403,12 +7413,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s6
@@ -6420,7 +7430,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX940-NEXT:    v_mov_b32_e32 v4, s4
-; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6447,12 +7457,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0
@@ -6463,7 +7473,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
@@ -6495,13 +7505,13 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -6515,7 +7525,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6544,12 +7554,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
@@ -6565,7 +7575,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX90A-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6591,12 +7601,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b32 s11, s17
@@ -6612,7 +7622,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX908-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
-; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6639,12 +7649,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s11, s17
@@ -6658,7 +7668,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6688,12 +7698,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -6712,7 +7722,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -6734,12 +7744,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -6758,7 +7768,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -6781,18 +7791,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6801,7 +7811,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX12-NEXT:    s_mov_b32 s1, exec_lo
-; GFX12-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -6815,15 +7825,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
 ; GFX12-NEXT:    ; implicit-def: $vgpr4
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX12-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
@@ -6847,7 +7857,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX12-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -6862,8 +7872,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -6872,18 +7882,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -6896,7 +7906,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
 ; GFX940-NEXT:    ; implicit-def: $vgpr4
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
@@ -6904,9 +7914,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    s_movk_i32 s10, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
 ; GFX940-NEXT:    s_mov_b32 s11, 0x7060302
-; GFX940-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX940-NEXT:    v_max_f32_e32 v4, v4, v9
@@ -6927,7 +7937,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v11, vcc
 ; GFX940-NEXT:    v_perm_b32 v6, v5, v4, s11
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX940-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -6941,8 +7951,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -6950,19 +7960,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -6976,16 +7986,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
 ; GFX11-NEXT:    ; implicit-def: $vgpr4
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
@@ -7009,7 +8019,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX11-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -7024,8 +8034,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7035,20 +8045,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
-; GFX10-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7061,14 +8071,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    ; implicit-def: $vgpr4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX10-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
@@ -7089,7 +8099,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX10-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7103,8 +8113,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7114,18 +8124,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7138,7 +8148,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
@@ -7146,9 +8156,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    s_movk_i32 s14, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
 ; GFX90A-NEXT:    s_mov_b32 s15, 0x7060302
-; GFX90A-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v4, v9
@@ -7167,7 +8177,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    v_perm_b32 v6, v5, v4, s15
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX90A-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7180,8 +8190,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -7189,18 +8199,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7213,7 +8223,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX908-NEXT:    ; implicit-def: $vgpr4
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
@@ -7221,9 +8231,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    s_movk_i32 s14, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX908-NEXT:    s_mov_b32 s15, 0x7060302
-; GFX908-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX908-NEXT:    v_max_f32_e32 v4, v4, v8
@@ -7243,7 +8253,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX908-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7256,8 +8266,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -7265,18 +8275,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7289,15 +8299,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX8-NEXT:    ; implicit-def: $vgpr4
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX8-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX8-NEXT:    v_max_f32_e32 v4, v4, v8
@@ -7320,7 +8330,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX8-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7333,8 +8343,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -7342,18 +8352,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x400, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7365,7 +8375,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX7-NEXT:    ; implicit-def: $vgpr4
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
@@ -7376,9 +8386,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX7-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v7
@@ -7392,7 +8402,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX7-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX7-NEXT:    v_mov_b32_e32 v6, v4
-; GFX7-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX7-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7405,8 +8415,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
@@ -7415,19 +8425,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, 0x400, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7439,7 +8449,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX6-NEXT:    ; implicit-def: $vgpr4
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 1.0, v6
@@ -7450,9 +8460,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
 ; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX6-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 1.0, v7
@@ -7466,7 +8476,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX6-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX6-NEXT:    v_mov_b32_e32 v6, v4
-; GFX6-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX6-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7479,8 +8489,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
@@ -7490,14 +8500,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
@@ -7505,43 +8515,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
 ; misc
 ; --------------------------------------------------------------------
 
-define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1
-; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v5, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v5, v5
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v0, v2
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -7551,7 +8541,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v3, s6
-; GFX940-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v0
@@ -7565,79 +8555,38 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
-; GFX11-NEXT:    s_add_i32 s4, s6, 0x400
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
-; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX11-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT:    buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s11, s17
 ; GFX10-NEXT:    s_mov_b32 s10, s16
 ; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_add_i32 s4, s18, 0x400
-; GFX10-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
-; GFX10-NEXT:    v_max_f32_e32 v2, v1, v1
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX10-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
+; GFX10-NEXT:    buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -7651,7 +8600,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
-; GFX90A-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
@@ -7666,12 +8615,12 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -7685,7 +8634,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
-; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v0
@@ -7699,12 +8648,12 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -7718,7 +8667,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s6
-; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v0
@@ -7732,83 +8681,42 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX7-NEXT:    s_mov_b32 s11, s17
 ; GFX7-NEXT:    s_mov_b32 s10, s16
 ; GFX7-NEXT:    s_mov_b32 s9, s7
 ; GFX7-NEXT:    s_mov_b32 s8, s6
-; GFX7-NEXT:    v_mov_b32_e32 v0, s18
-; GFX7-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
-; GFX7-NEXT:    s_add_i32 s6, s18, 0x400
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s6
-; GFX7-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v5
-; GFX7-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v0, v4
-; GFX7-NEXT:    v_mov_b32_e32 v1, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX6-NEXT:    s_mov_b32 s11, s17
 ; GFX6-NEXT:    s_mov_b32 s10, s16
 ; GFX6-NEXT:    s_mov_b32 s9, s7
 ; GFX6-NEXT:    s_mov_b32 s8, s6
-; GFX6-NEXT:    v_mov_b32_e32 v0, s18
-; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
-; GFX6-NEXT:    s_add_i32 s6, s18, 0x400
-; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s6
-; GFX6-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v5
-; GFX6-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v0, v4
-; GFX6-NEXT:    v_mov_b32_e32 v1, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
+; GFX6-NEXT:    v_mov_b32_e32 v1, s18
+; GFX6-NEXT:    buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val seq_cst
+  %result = atomicrmw fmax ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-
+attributes #0 = { nounwind }
 
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index cd01cc7309fcd..61ee956747135 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -13,8 +13,8 @@
 ; float
 ; --------------------------------------------------------------------
 
-define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -29,7 +29,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -58,7 +58,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s6
@@ -69,7 +69,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -84,7 +84,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -116,7 +116,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -149,7 +149,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -182,7 +182,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -195,7 +195,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -209,12 +209,12 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -229,7 +229,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s6
@@ -257,7 +257,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s6
@@ -268,7 +268,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -283,7 +283,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
@@ -314,7 +314,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b32 s11, s17
@@ -346,7 +346,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s11, s17
@@ -378,7 +378,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -391,7 +391,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -405,12 +405,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -444,7 +444,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
@@ -505,7 +505,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s1, exec_lo
@@ -535,7 +535,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s5, exec_lo
@@ -564,7 +564,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
@@ -623,7 +623,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
@@ -683,7 +683,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
@@ -743,7 +743,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
@@ -769,7 +769,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
@@ -796,90 +796,79 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-; --------------------------------------------------------------------
-; double
-; --------------------------------------------------------------------
-
-define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s6
-; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mov_b32_e32 v6, s4
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
-; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
-; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB3_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX940-NEXT:    s_addk_i32 s6, 0x400
+; GFX940-NEXT:    s_mov_b64 s[4:5], 0
+; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, s6
+; GFX940-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v0
+; GFX940-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX940-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
+; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, s6
-; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mov_b32_e32 v6, s4
-; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x400
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
+; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX11-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX11-NEXT:    v_min_f32_e32 v4, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
-; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
@@ -888,64 +877,99 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7)
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX10-NEXT:    s_mov_b32 s11, s17
 ; GFX10-NEXT:    s_mov_b32 s10, s16
 ; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_add_i32 s4, s18, 0x400
+; GFX10-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX10-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX10-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX10-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s10, s16
 ; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s18
+; GFX90A-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
+; GFX90A-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
+; GFX90A-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX90A-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX908-NEXT:    s_mov_b32 s11, s17
 ; GFX908-NEXT:    s_mov_b32 s10, s16
 ; GFX908-NEXT:    s_mov_b32 s9, s7
 ; GFX908-NEXT:    s_mov_b32 s8, s6
-; GFX908-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v0, s18
-; GFX908-NEXT:    v_mov_b32_e32 v3, v1
-; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
-; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v10, v1
-; GFX908-NEXT:    v_mov_b32_e32 v9, v0
-; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX908-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v7
-; GFX908-NEXT:    v_mov_b32_e32 v1, v8
-; GFX908-NEXT:    v_mov_b32_e32 v2, v9
-; GFX908-NEXT:    v_mov_b32_e32 v3, v10
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB3_1
@@ -953,36 +977,32 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX8-NEXT:    s_mov_b32 s11, s17
 ; GFX8-NEXT:    s_mov_b32 s10, s16
 ; GFX8-NEXT:    s_mov_b32 s9, s7
 ; GFX8-NEXT:    s_mov_b32 s8, s6
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s18
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
-; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB3_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v10, v1
-; GFX8-NEXT:    v_mov_b32_e32 v9, v0
-; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
-; GFX8-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v7
-; GFX8-NEXT:    v_mov_b32_e32 v1, v8
-; GFX8-NEXT:    v_mov_b32_e32 v2, v9
-; GFX8-NEXT:    v_mov_b32_e32 v3, v10
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB3_1
@@ -990,669 +1010,1659 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX7-NEXT:    s_mov_b32 s11, s17
 ; GFX7-NEXT:    s_mov_b32 s10, s16
 ; GFX7-NEXT:    s_mov_b32 s9, s7
 ; GFX7-NEXT:    s_mov_b32 s8, s6
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX7-NEXT:    v_mov_b32_e32 v0, s18
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX7-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v3, s6
+; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v5, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX7-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v1, v5
+; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX6-NEXT:    s_mov_b32 s11, s17
 ; GFX6-NEXT:    s_mov_b32 s10, s16
 ; GFX6-NEXT:    s_mov_b32 s9, s7
 ; GFX6-NEXT:    s_mov_b32 s8, s6
-; GFX6-NEXT:    v_mov_b32_e32 v2, s18
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX6-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX6-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX6-NEXT:    v_mov_b32_e32 v3, s6
+; GFX6-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v5, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX6-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v0, v4
+; GFX6-NEXT:    v_mov_b32_e32 v1, v5
+; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
-  ret double %result
+  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret float %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v2, s6
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
-; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_mov_b32_e32 v6, s4
-; GFX12-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, s6
-; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v6, s4
-; GFX11-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX940-NEXT:    s_addk_i32 s6, 0x400
+; GFX940-NEXT:    s_mov_b64 s[4:5], 0
+; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, s6
+; GFX940-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v0
+; GFX940-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX940-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX940-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
-; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s11, s17
 ; GFX10-NEXT:    s_mov_b32 s10, s16
 ; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_mov_b32 s8, s6
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
 ; GFX90A-NEXT:    s_mov_b32 s10, s16
 ; GFX90A-NEXT:    s_mov_b32 s9, s7
 ; GFX90A-NEXT:    s_mov_b32 s8, s6
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s18
+; GFX90A-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT:    s_add_i32 s6, s18, 0x400
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
+; GFX90A-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
+; GFX90A-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX90A-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX908-NEXT:    s_mov_b32 s11, s17
 ; GFX908-NEXT:    s_mov_b32 s10, s16
 ; GFX908-NEXT:    s_mov_b32 s9, s7
 ; GFX908-NEXT:    s_mov_b32 s8, s6
-; GFX908-NEXT:    v_mov_b32_e32 v2, s18
-; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
-; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
+; GFX908-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v10, v3
-; GFX908-NEXT:    v_mov_b32_e32 v9, v2
-; GFX908-NEXT:    v_mov_b32_e32 v8, v1
-; GFX908-NEXT:    v_mov_b32_e32 v7, v0
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    v_mov_b32_e32 v5, v0
+; GFX908-NEXT:    v_max_f32_e32 v0, v5, v5
+; GFX908-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX908-NEXT:    v_mov_b32_e32 v2, v7
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX8-NEXT:    s_mov_b32 s11, s17
 ; GFX8-NEXT:    s_mov_b32 s10, s16
 ; GFX8-NEXT:    s_mov_b32 s9, s7
 ; GFX8-NEXT:    s_mov_b32 s8, s6
-; GFX8-NEXT:    v_mov_b32_e32 v2, s18
-; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
-; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x400
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v10, v3
-; GFX8-NEXT:    v_mov_b32_e32 v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v8, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, v0
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    v_mov_b32_e32 v5, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v5
+; GFX8-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
-; GFX8-NEXT:    v_mov_b32_e32 v2, v7
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
 ; GFX7-NEXT:    s_mov_b32 s10, s16
 ; GFX7-NEXT:    s_mov_b32 s9, s7
 ; GFX7-NEXT:    s_mov_b32 s8, s6
-; GFX7-NEXT:    v_mov_b32_e32 v2, s18
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
 ; GFX6-NEXT:    s_mov_b32 s10, s16
 ; GFX6-NEXT:    s_mov_b32 s9, s7
 ; GFX6-NEXT:    s_mov_b32 s8, s6
-; GFX6-NEXT:    v_mov_b32_e32 v2, s18
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX6-NEXT:    v_mov_b32_e32 v1, s18
+; GFX6-NEXT:    buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
-  ret void
+  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret float %result
 }
 
-define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; --------------------------------------------------------------------
+; double
+; --------------------------------------------------------------------
+
+define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
-; GFX12-NEXT:    s_mov_b32 s1, exec_lo
-; GFX12-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX12-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
-; GFX12-NEXT:    ; implicit-def: $vgpr4
-; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX12-NEXT:  ; %bb.2:
-; GFX12-NEXT:    s_mov_b32 exec_lo, s1
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
-; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB5_4 Depth 2
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
-; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_min_num_f64_e32 v[11:12], v[0:1], v[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX12-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
-; GFX12-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX12-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
-; GFX12-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v7, v6
-; GFX940-NEXT:    v_mov_b32_e32 v6, v5
-; GFX940-NEXT:    s_mov_b64 s[2:3], exec
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
-; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX940-NEXT:    ; implicit-def: $vgpr4
-; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX940-NEXT:  ; %bb.2:
-; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, v6
-; GFX940-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
-; GFX11-NEXT:    s_mov_b32 s1, 0
-; GFX11-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX11-NEXT:  ; %bb.2:
-; GFX11-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
-; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB5_4 Depth 2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
-; GFX11-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX11-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
-; GFX11-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
-; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
-; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
-; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
-; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s5, exec_lo
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
-; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
-; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
-; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX10-NEXT:    ; implicit-def: $vgpr4
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX10-NEXT:  ; %bb.2:
-; GFX10-NEXT:    s_mov_b32 exec_lo, s5
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v5
-; GFX10-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v6
-; GFX90A-NEXT:    v_mov_b32_e32 v6, v5
-; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
-; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX90A-NEXT:    ; implicit-def: $vgpr4
-; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX90A-NEXT:  ; %bb.2:
-; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v6
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v8, v3
-; GFX908-NEXT:    v_mov_b32_e32 v7, v2
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v10, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v9, v0
-; GFX908-NEXT:    v_add_u32_e32 v15, 0x800, v4
-; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX908-NEXT:    ; implicit-def: $vgpr4
-; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX908-NEXT:  ; %bb.2:
-; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX908-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
-; GFX908-NEXT:    s_mov_b64 s[6:7], 0
-; GFX908-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB5_4 Depth 2
+; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v7
+; GFX908-NEXT:    v_mov_b32_e32 v1, v8
+; GFX908-NEXT:    v_mov_b32_e32 v2, v9
+; GFX908-NEXT:    v_mov_b32_e32 v3, v10
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
-; GFX908-NEXT:    s_mov_b64 s[12:13], exec
-; GFX908-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v11
-; GFX908-NEXT:    v_mov_b32_e32 v1, v12
-; GFX908-NEXT:    v_mov_b32_e32 v2, v13
-; GFX908-NEXT:    v_mov_b32_e32 v3, v14
-; GFX908-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NEXT:    v_mov_b32_e32 v3, v10
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
+}
+
+define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX12-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s6
+; GFX11-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
+; GFX11-NEXT:    v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, s18
+; GFX908-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX908-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v10, v3
+; GFX908-NEXT:    v_mov_b32_e32 v9, v2
+; GFX908-NEXT:    v_mov_b32_e32 v8, v1
+; GFX908-NEXT:    v_mov_b32_e32 v7, v0
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v7
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v8
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v10, v3
+; GFX8-NEXT:    v_mov_b32_e32 v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, v0
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v2, v7
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v8
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
+define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
+; GFX12-NEXT:    s_mov_b32 s1, exec_lo
+; GFX12-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX12-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
+; GFX12-NEXT:    ; implicit-def: $vgpr4
+; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-NEXT:  ; %bb.2:
+; GFX12-NEXT:    s_mov_b32 exec_lo, s1
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Loop Header: Depth=1
+; GFX12-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
+; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_min_num_f64_e32 v[11:12], v[0:1], v[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
+; GFX12-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX12-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX12-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX12-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX12-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX12-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX12-NEXT:    s_mov_b32 exec_lo, s2
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
+; GFX12-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v6
+; GFX940-NEXT:    v_mov_b32_e32 v6, v5
+; GFX940-NEXT:    s_mov_b64 s[2:3], exec
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX940-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
+; GFX940-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX940-NEXT:    ; implicit-def: $vgpr4
+; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX940-NEXT:  ; %bb.2:
+; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v6
+; GFX940-NEXT:    v_mov_b32_e32 v1, v7
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, 0x800, v4
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX11-NEXT:    buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
+; GFX11-NEXT:    ; implicit-def: $vgpr4
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Loop Header: Depth=1
+; GFX11-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
+; GFX11-NEXT:    v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX11-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v9
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v10
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX11-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
+; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s5, exec_lo
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT:    s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_and_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
+; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX10-NEXT:    ; implicit-def: $vgpr4
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX10-NEXT:  ; %bb.2:
+; GFX10-NEXT:    s_mov_b32 exec_lo, s5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v5
+; GFX10-NEXT:    v_mov_b32_e32 v1, v6
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v6
+; GFX90A-NEXT:    v_mov_b32_e32 v6, v5
+; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
+; GFX90A-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
+; GFX90A-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX90A-NEXT:    ; implicit-def: $vgpr4
+; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX90A-NEXT:  ; %bb.2:
+; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v6
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v7
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v8, v3
+; GFX908-NEXT:    v_mov_b32_e32 v7, v2
+; GFX908-NEXT:    v_mov_b32_e32 v10, v1
+; GFX908-NEXT:    v_mov_b32_e32 v9, v0
+; GFX908-NEXT:    v_add_u32_e32 v15, 0x800, v4
+; GFX908-NEXT:    s_mov_b64 s[6:7], exec
+; GFX908-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_nop 0
+; GFX908-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    ; implicit-def: $vgpr4
+; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX908-NEXT:  ; %bb.2:
+; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Loop Header: Depth=1
+; GFX908-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
+; GFX908-NEXT:    s_mov_b64 s[12:13], exec
+; GFX908-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v11
+; GFX908-NEXT:    v_mov_b32_e32 v1, v12
+; GFX908-NEXT:    v_mov_b32_e32 v2, v13
+; GFX908-NEXT:    v_mov_b32_e32 v3, v14
+; GFX908-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX908-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX908-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX908-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX908-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX908-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX908-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
+; GFX908-NEXT:    v_mov_b32_e32 v14, v1
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v13, v0
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v8, v3
+; GFX8-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v0
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0x800, v4
+; GFX8-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    ; implicit-def: $vgpr4
+; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX8-NEXT:  ; %bb.2:
+; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:  .LBB7_3: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Loop Header: Depth=1
+; GFX8-NEXT:    ; Child Loop BB7_4 Depth 2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
+; GFX8-NEXT:    s_mov_b64 s[12:13], exec
+; GFX8-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v11
+; GFX8-NEXT:    v_mov_b32_e32 v1, v12
+; GFX8-NEXT:    v_mov_b32_e32 v2, v13
+; GFX8-NEXT:    v_mov_b32_e32 v3, v14
+; GFX8-NEXT:  .LBB7_4: ; Parent Loop BB7_3 Depth=1
+; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
+; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
+; GFX8-NEXT:    v_mov_b32_e32 v14, v1
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v13, v0
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b64 s[6:7], exec
+; GFX7-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
+; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX7-NEXT:    ; implicit-def: $vgpr4
+; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7-NEXT:  ; %bb.2:
+; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v0, v5
+; GFX7-NEXT:    v_mov_b32_e32 v1, v6
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
+; GFX6-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX6-NEXT:    ; implicit-def: $vgpr4
+; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX6-NEXT:  ; %bb.2:
+; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v0, v5
+; GFX6-NEXT:    v_mov_b32_e32 v1, v6
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
+}
+
+define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX10-NEXT:    s_add_i32 s4, s18, 0x800
+; GFX10-NEXT:    v_mov_b32_e32 v6, s4
+; GFX10-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX10-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v7
+; GFX10-NEXT:    v_mov_b32_e32 v1, v8
+; GFX10-NEXT:    v_mov_b32_e32 v2, v9
+; GFX10-NEXT:    v_mov_b32_e32 v3, v10
+; GFX10-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s18
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
+; GFX90A-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX90A-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT:    v_mov_b32_e32 v6, s6
+; GFX90A-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX90A-NEXT:    v_min_f64 v[8:9], v[0:1], v[4:5]
+; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
+; GFX90A-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v10, v1
+; GFX908-NEXT:    v_mov_b32_e32 v9, v0
+; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v7
+; GFX908-NEXT:    v_mov_b32_e32 v1, v8
+; GFX908-NEXT:    v_mov_b32_e32 v2, v9
+; GFX908-NEXT:    v_mov_b32_e32 v3, v10
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NEXT:    v_mov_b32_e32 v3, v10
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, v0
+; GFX7-NEXT:    v_mov_b32_e32 v0, s18
+; GFX7-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX7-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX7-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_mov_b32_e32 v6, s6
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v10, v1
+; GFX7-NEXT:    v_mov_b32_e32 v9, v0
+; GFX7-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX7-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v7
+; GFX7-NEXT:    v_mov_b32_e32 v1, v8
+; GFX7-NEXT:    v_mov_b32_e32 v2, v9
+; GFX7-NEXT:    v_mov_b32_e32 v3, v10
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX6-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX6-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    v_mov_b32_e32 v6, s6
+; GFX6-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v10, v1
+; GFX6-NEXT:    v_mov_b32_e32 v9, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX6-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v7
+; GFX6-NEXT:    v_mov_b32_e32 v1, v8
+; GFX6-NEXT:    v_mov_b32_e32 v2, v9
+; GFX6-NEXT:    v_mov_b32_e32 v3, v10
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret double %result
+}
+
+define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    s_add_i32 s4, s6, 0x800
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v6, s4
+; GFX11-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX11-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT:    v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT:    buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    s_mov_b32 s11, s17
+; GFX10-NEXT:    s_mov_b32 s10, s16
+; GFX10-NEXT:    s_mov_b32 s9, s7
+; GFX10-NEXT:    s_mov_b32 s8, s6
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_mov_b32 s11, s17
+; GFX90A-NEXT:    s_mov_b32 s10, s16
+; GFX90A-NEXT:    s_mov_b32 s9, s7
+; GFX90A-NEXT:    s_mov_b32 s8, s6
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s18
+; GFX90A-NEXT:    buffer_atomic_min_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s11, s17
+; GFX908-NEXT:    s_mov_b32 s10, s16
+; GFX908-NEXT:    s_mov_b32 s9, s7
+; GFX908-NEXT:    s_mov_b32 s8, s6
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s18
+; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX908-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX908-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s6
+; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
-; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX908-NEXT:    v_mov_b32_e32 v10, v1
+; GFX908-NEXT:    v_mov_b32_e32 v9, v0
+; GFX908-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX908-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v7
+; GFX908-NEXT:    v_mov_b32_e32 v1, v8
+; GFX908-NEXT:    v_mov_b32_e32 v2, v9
+; GFX908-NEXT:    v_mov_b32_e32 v3, v10
+; GFX908-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
-; GFX908-NEXT:    v_mov_b32_e32 v14, v1
-; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT:    v_mov_b32_e32 v13, v0
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v8, v3
-; GFX8-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8-NEXT:    s_mov_b32 s11, s17
+; GFX8-NEXT:    s_mov_b32 s10, s16
+; GFX8-NEXT:    s_mov_b32 s9, s7
+; GFX8-NEXT:    s_mov_b32 s8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
+; GFX8-NEXT:    v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX8-NEXT:    s_add_i32 s6, s18, 0x800
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v10, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v9, v0
-; GFX8-NEXT:    v_add_u32_e32 v15, vcc, 0x800, v4
-; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX8-NEXT:    ; implicit-def: $vgpr4
-; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX8-NEXT:  ; %bb.2:
-; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX8-NEXT:    v_max_f64 v[4:5], v[5:6], v[5:6]
-; GFX8-NEXT:    s_mov_b64 s[6:7], 0
-; GFX8-NEXT:  .LBB5_3: ; %atomicrmw.start
-; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB5_4 Depth 2
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[0:1], v[13:14], v[13:14]
-; GFX8-NEXT:    s_mov_b64 s[12:13], exec
-; GFX8-NEXT:    v_min_f64 v[11:12], v[0:1], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v11
-; GFX8-NEXT:    v_mov_b32_e32 v1, v12
-; GFX8-NEXT:    v_mov_b32_e32 v2, v13
-; GFX8-NEXT:    v_mov_b32_e32 v3, v14
-; GFX8-NEXT:  .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT:    v_readfirstlane_b32 s8, v9
-; GFX8-NEXT:    v_readfirstlane_b32 s9, v10
-; GFX8-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX8-NEXT:    v_readfirstlane_b32 s11, v8
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX8-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
-; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX8-NEXT:    v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX8-NEXT:    v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v7
+; GFX8-NEXT:    v_mov_b32_e32 v1, v8
+; GFX8-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NEXT:    v_mov_b32_e32 v3, v10
+; GFX8-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
-; GFX8-NEXT:    v_mov_b32_e32 v14, v1
-; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT:    v_mov_b32_e32 v13, v0
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB5_3
-; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
-; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX7-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX7-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX7-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
-; GFX7-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX7-NEXT:    ; implicit-def: $vgpr4
-; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX7-NEXT:  ; %bb.2:
-; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX7-NEXT:    s_mov_b32 s11, s17
+; GFX7-NEXT:    s_mov_b32 s10, s16
+; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s8, s6
+; GFX7-NEXT:    v_mov_b32_e32 v2, s18
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, v5
-; GFX7-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX6-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX6-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
-; GFX6-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX6-NEXT:    ; implicit-def: $vgpr4
-; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX6-NEXT:  ; %bb.2:
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX6-NEXT:    s_mov_b32 s11, s17
+; GFX6-NEXT:    s_mov_b32 s10, s16
+; GFX6-NEXT:    s_mov_b32 s9, s7
+; GFX6-NEXT:    s_mov_b32 s8, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s18
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, v5
-; GFX6-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %result
 }
 
@@ -1660,8 +2670,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a
 ; half
 ; --------------------------------------------------------------------
 
-define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1680,7 +2690,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX12-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -1703,13 +2713,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -1722,7 +2732,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX940-NEXT:    s_not_b32 s7, s4
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX940-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1739,13 +2749,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -1761,7 +2771,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -1784,13 +2794,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -1807,7 +2817,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -1826,13 +2836,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -1849,7 +2859,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX90A-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1865,13 +2875,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -1888,7 +2898,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1905,13 +2915,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -1928,7 +2938,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v5, v0, v0
-; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1946,13 +2956,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -1970,7 +2980,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -1989,14 +2999,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -2014,7 +3024,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2034,7 +3044,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
@@ -2042,12 +3052,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2066,7 +3076,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -2089,12 +3099,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -2107,7 +3117,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_not_b32 s7, s4
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX940-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2124,12 +3134,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -2145,7 +3155,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -2168,12 +3178,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -2190,7 +3200,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -2209,12 +3219,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -2231,7 +3241,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_not_b32 s7, s4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2247,12 +3257,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -2269,7 +3279,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_not_b32 s7, s4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX908-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2286,12 +3296,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -2308,7 +3318,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
-; GFX8-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2326,12 +3336,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -2349,7 +3359,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2368,12 +3378,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -2391,7 +3401,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -2411,18 +3421,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2438,7 +3448,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX12-NEXT:    v_not_b32_e32 v9, v6
-; GFX12-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -2451,14 +3461,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_max_num_f16_e32 v10, v5, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-NEXT:    s_mov_b32 s2, exec_lo
@@ -2474,7 +3484,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX12-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -2489,8 +3499,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -2499,13 +3509,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -2516,7 +3526,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v8, s0
 ; GFX940-NEXT:    v_not_b32_e32 v10, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -2528,14 +3538,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX940-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX940-NEXT:    v_max_f16_e32 v11, v5, v5
-; GFX940-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
 ; GFX940-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2545,7 +3555,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX940-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -2559,8 +3569,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -2568,13 +3578,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -2587,7 +3597,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX11-NEXT:    v_not_b32_e32 v9, v6
-; GFX11-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -2600,14 +3610,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_max_f16_e32 v10, v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
@@ -2622,7 +3632,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX11-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -2637,8 +3647,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -2648,13 +3658,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -2665,7 +3675,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v9, v6
-; GFX10-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2677,13 +3687,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_max_f16_e32 v10, v5, v5
-; GFX10-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
@@ -2694,7 +3704,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX10-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2708,8 +3718,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -2719,13 +3729,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -2736,7 +3746,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v8, s4
 ; GFX90A-NEXT:    v_not_b32_e32 v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2748,14 +3758,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v11, v5, v5
-; GFX90A-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v8, v7
 ; GFX90A-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2764,7 +3774,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    v_and_or_b32 v6, v7, v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX90A-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2777,8 +3787,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -2786,13 +3796,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -2803,7 +3813,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX908-NEXT:    v_not_b32_e32 v9, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2815,14 +3825,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    s_nop 0
 ; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX908-NEXT:    v_max_f16_e32 v10, v5, v5
-; GFX908-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX908-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2832,7 +3842,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX908-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2845,8 +3855,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -2854,13 +3864,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x200, v4
@@ -2871,7 +3881,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX8-NEXT:    v_not_b32_e32 v9, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2883,14 +3893,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_max_f16_e32 v10, v5, v5
-; GFX8-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX8-NEXT:    v_max_f16_e32 v4, v4, v4
@@ -2901,7 +3911,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX8-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2914,8 +3924,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -2923,13 +3933,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -2939,7 +3949,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX7-NEXT:    v_not_b32_e32 v9, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -2950,15 +3960,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v4
-; GFX7-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -2970,7 +3980,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    v_or_b32_e32 v5, v5, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v6
-; GFX7-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX7-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -2983,8 +3993,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -2992,14 +4002,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -3009,7 +4019,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_not_b32_e32 v9, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -3020,15 +4030,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v4
-; GFX6-NEXT:  .LBB8_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB12_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB8_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB12_4 Depth 2
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -3040,7 +4050,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v5, v6
-; GFX6-NEXT:  .LBB8_4: ; Parent Loop BB8_3 Depth=1
+; GFX6-NEXT:  .LBB12_4: ; Parent Loop BB12_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -3053,8 +4063,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB8_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -3062,7 +4072,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB12_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
@@ -3070,7 +4080,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
@@ -3078,8 +4088,8 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
 ; bfloat
 ; --------------------------------------------------------------------
 
-define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3098,7 +4108,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    buffer_load_b32 v1, v4, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3127,13 +4137,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -3147,7 +4157,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX940-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3169,13 +4179,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -3191,7 +4201,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3221,13 +4231,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -3244,7 +4254,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3267,13 +4277,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, s4, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -3291,7 +4301,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3311,13 +4321,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -3335,7 +4345,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3356,13 +4366,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -3379,7 +4389,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -3403,13 +4413,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -3427,7 +4437,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3447,14 +4457,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -3472,7 +4482,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX6-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3493,7 +4503,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v2
@@ -3501,12 +4511,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3525,7 +4535,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    buffer_load_b32 v1, v2, s[0:3], null offen
 ; GFX12-NEXT:    s_not_b32 s6, s5
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3554,12 +4564,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_addk_i32 s6, 0x200
@@ -3573,7 +4583,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX940-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3595,12 +4605,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_addk_i32 s6, 0x200
@@ -3616,7 +4626,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_not_b32 s6, s5
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, s4, v1
@@ -3646,12 +4656,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_addk_i32 s18, 0x200
@@ -3668,7 +4678,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    s_lshl_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    s_not_b32 s6, s5
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3691,12 +4701,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_addk_i32 s18, 0x200
@@ -3714,7 +4724,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX90A-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3734,12 +4744,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_addk_i32 s18, 0x200
@@ -3757,7 +4767,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
 ; GFX908-NEXT:    s_movk_i32 s12, 0x7fff
-; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3778,12 +4788,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_addk_i32 s18, 0x200
@@ -3800,7 +4810,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_not_b32 s7, s4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -3824,12 +4834,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_addk_i32 s18, 0x200
@@ -3847,7 +4857,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_not_b32 s7, s4
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3867,12 +4877,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_addk_i32 s18, 0x200
@@ -3890,7 +4900,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_not_b32 s7, s4
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX6-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s6, v1
@@ -3911,18 +4921,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3938,7 +4948,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX12-NEXT:    v_not_b32_e32 v9, v6
-; GFX12-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -3951,14 +4961,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT:    buffer_load_b32 v6, v8, s[4:7], null offen
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX12-NEXT:    s_mov_b32 s2, exec_lo
@@ -3980,7 +4990,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX12-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -3995,8 +5005,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -4005,13 +5015,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -4022,7 +5032,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v8, s0
 ; GFX940-NEXT:    v_not_b32_e32 v10, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -4034,15 +5044,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX940-NEXT:    buffer_load_dword v7, v9, s[4:7], 0 offen
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX940-NEXT:    s_movk_i32 s10, 0x7fff
-; GFX940-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX940-NEXT:    s_mov_b64 s[8:9], exec
@@ -4057,7 +5067,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX940-NEXT:    v_and_or_b32 v6, v7, v10, v4
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX940-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -4071,8 +5081,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -4080,13 +5090,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -4099,7 +5109,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX11-NEXT:    v_not_b32_e32 v9, v6
-; GFX11-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -4112,15 +5122,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT:    buffer_load_b32 v6, v8, s[4:7], 0 offen
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
@@ -4142,7 +5152,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX11-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -4157,8 +5167,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -4168,14 +5178,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x200, v4
@@ -4186,7 +5196,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v6, v7, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v9, v6
-; GFX10-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4198,13 +5208,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
@@ -4219,7 +5229,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    v_and_or_b32 v5, v6, v9, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX10-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4233,8 +5243,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -4244,13 +5254,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -4261,7 +5271,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v8, s4
 ; GFX90A-NEXT:    v_not_b32_e32 v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4273,15 +5283,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    s_nop 0
 ; GFX90A-NEXT:    buffer_load_dword v7, v9, s[8:11], 0 offen
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
 ; GFX90A-NEXT:    s_movk_i32 s14, 0x7fff
-; GFX90A-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX90A-NEXT:    v_min_f32_e32 v4, v4, v11
@@ -4294,7 +5304,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    v_and_or_b32 v6, v7, v10, v4
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX90A-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4307,8 +5317,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -4316,13 +5326,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v8, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v4, 0x200, v4
@@ -4333,7 +5343,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX908-NEXT:    v_not_b32_e32 v9, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4345,15 +5355,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    s_nop 0
 ; GFX908-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
 ; GFX908-NEXT:    s_movk_i32 s14, 0x7fff
-; GFX908-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX908-NEXT:    v_min_f32_e32 v4, v4, v10
@@ -4367,7 +5377,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX908-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4380,8 +5390,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4389,13 +5399,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x200, v4
@@ -4406,7 +5416,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v7, s4
 ; GFX8-NEXT:    v_not_b32_e32 v9, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4418,14 +5428,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX8-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_min_f32_e32 v4, v4, v10
@@ -4441,7 +5451,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX8-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4454,8 +5464,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4463,13 +5473,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -4479,7 +5489,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX7-NEXT:    v_not_b32_e32 v9, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4490,15 +5500,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX7-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX7-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -4511,7 +5521,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX7-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX7-NEXT:    v_mov_b32_e32 v5, v6
-; GFX7-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX7-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4524,8 +5534,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4533,14 +5543,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX7-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x200, v4
@@ -4550,7 +5560,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    v_lshl_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_not_b32_e32 v9, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB15_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -4561,15 +5571,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX6-NEXT:    buffer_load_dword v6, v8, s[8:11], 0 offen
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v5
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v4
-; GFX6-NEXT:  .LBB11_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB15_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB11_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB15_4 Depth 2
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -4582,7 +5592,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX6-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX6-NEXT:    v_mov_b32_e32 v5, v6
-; GFX6-NEXT:  .LBB11_4: ; Parent Loop BB11_3 Depth=1
+; GFX6-NEXT:  .LBB15_4: ; Parent Loop BB15_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -4595,8 +5605,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -4604,7 +5614,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB15_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v7, v4
@@ -4612,7 +5622,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
@@ -4620,8 +5630,8 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
 ; <2 x half>
 ; --------------------------------------------------------------------
 
-define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4635,7 +5645,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v1, v1
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
 ; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v0
@@ -4652,12 +5662,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -4667,7 +5677,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v3, s6
-; GFX940-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v0
@@ -4682,12 +5692,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
@@ -4697,7 +5707,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX11-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v0
@@ -4715,12 +5725,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -4734,7 +5744,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX10-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0
@@ -4750,12 +5760,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -4769,7 +5779,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
-; GFX90A-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
@@ -4782,12 +5792,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -4801,7 +5811,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v1, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
-; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v0
@@ -4815,12 +5825,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -4835,7 +5845,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v3, v1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -4852,12 +5862,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -4877,7 +5887,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s6
-; GFX7-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -4902,12 +5912,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -4927,7 +5937,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s6
-; GFX6-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB16_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -4953,18 +5963,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB16_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4977,7 +5987,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX12-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
-; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v0, v1, v1
@@ -4994,12 +6004,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s6
@@ -5008,7 +6018,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v3, s6
-; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5023,12 +6033,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s6
@@ -5037,7 +6047,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
-; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5054,12 +6064,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -5072,7 +6082,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX10-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5088,12 +6098,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
@@ -5106,7 +6116,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
-; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5119,12 +6129,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b32 s11, s17
@@ -5137,7 +6147,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v0, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
-; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v0, v1, v1
@@ -5151,12 +6161,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s11, s17
@@ -5170,7 +6180,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v3, v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -5187,12 +6197,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -5212,7 +6222,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -5237,12 +6247,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -5262,7 +6272,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB17_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -5288,18 +6298,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB17_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5308,7 +6318,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX12-NEXT:    s_mov_b32 s1, exec_lo
-; GFX12-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -5322,14 +6332,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
 ; GFX12-NEXT:    ; implicit-def: $vgpr4
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_pk_max_num_f16 v8, v5, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v6, v6
 ; GFX12-NEXT:    s_mov_b32 s2, exec_lo
@@ -5339,7 +6349,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX12-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -5354,8 +6364,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -5364,18 +6374,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -5388,21 +6398,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX940-NEXT:    buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
 ; GFX940-NEXT:    ; implicit-def: $vgpr4
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX940-NEXT:    v_pk_max_f16 v9, v5, v5
-; GFX940-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v4, v7, v7
 ; GFX940-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX940-NEXT:    v_pk_min_f16 v6, v4, v9
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX940-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -5416,8 +6426,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -5425,19 +6435,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -5451,14 +6461,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
 ; GFX11-NEXT:    ; implicit-def: $vgpr4
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_pk_max_f16 v8, v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v4, v6, v6
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
@@ -5467,7 +6477,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    v_pk_min_f16 v5, v4, v8
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX11-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -5482,8 +6492,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -5493,19 +6503,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
-; GFX10-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5518,13 +6528,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    ; implicit-def: $vgpr4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_pk_max_f16 v8, v5, v5
-; GFX10-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v4, v6, v6
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
@@ -5532,7 +6542,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    v_pk_min_f16 v5, v4, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX10-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5546,8 +6556,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -5557,18 +6567,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5581,20 +6591,20 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v9, v5, v5
-; GFX90A-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v7, v7
 ; GFX90A-NEXT:    v_pk_min_f16 v6, v4, v9
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX90A-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5607,8 +6617,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -5616,18 +6626,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5640,21 +6650,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX908-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX908-NEXT:    ; implicit-def: $vgpr4
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX908-NEXT:    v_pk_max_f16 v8, v5, v5
-; GFX908-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v4, v6, v6
 ; GFX908-NEXT:    v_pk_min_f16 v5, v4, v8
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX908-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5667,8 +6677,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -5676,18 +6686,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5700,15 +6710,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX8-NEXT:    ; implicit-def: $vgpr4
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v9, v5, v5
-; GFX8-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v6, v6
@@ -5718,7 +6728,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX8-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5731,8 +6741,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -5740,18 +6750,18 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5763,7 +6773,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX7-NEXT:    ; implicit-def: $vgpr4
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -5775,9 +6785,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v8
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
-; GFX7-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX7-NEXT:    s_mov_b64 s[12:13], exec
@@ -5793,7 +6803,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v7, v5
-; GFX7-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX7-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5806,8 +6816,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
@@ -5817,19 +6827,19 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX7-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 0x400, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -5841,7 +6851,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX6-NEXT:    ; implicit-def: $vgpr4
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB18_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -5853,9 +6863,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v10, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v11, v8
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
-; GFX6-NEXT:  .LBB14_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB18_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB18_4 Depth 2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX6-NEXT:    s_mov_b64 s[12:13], exec
@@ -5872,7 +6882,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    v_or_b32_e32 v5, v7, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v7, v5
-; GFX6-NEXT:  .LBB14_4: ; Parent Loop BB14_3 Depth=1
+; GFX6-NEXT:  .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -5885,8 +6895,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
@@ -5896,7 +6906,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB18_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v4
@@ -5904,7 +6914,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
@@ -5912,8 +6922,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; <2 x bfloat>
 ; --------------------------------------------------------------------
 
-define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5928,7 +6938,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
 ; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v0
@@ -5961,12 +6971,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -5979,7 +6989,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX940-NEXT:    v_mov_b32_e32 v4, s4
-; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v0
@@ -6006,12 +7016,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
 ; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
@@ -6024,7 +7034,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v0
@@ -6058,13 +7068,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
@@ -6079,7 +7089,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v0
@@ -6108,12 +7118,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -6130,7 +7140,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX90A-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v0
@@ -6156,12 +7166,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -6178,7 +7188,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX908-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
-; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v0
@@ -6205,12 +7215,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -6225,7 +7235,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -6255,12 +7265,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -6279,7 +7289,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s6
-; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -6301,12 +7311,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -6325,7 +7335,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s6
-; GFX6-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -6348,18 +7358,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB19_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6372,7 +7382,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX12-NEXT:    buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
 ; GFX12-NEXT:    s_mov_b32 s5, 0
-; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
@@ -6403,12 +7413,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX12-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s6
@@ -6420,7 +7430,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX940-NEXT:    s_mov_b32 s9, 0x7060302
 ; GFX940-NEXT:    v_mov_b32_e32 v4, s4
-; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6447,12 +7457,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX940-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0
@@ -6463,7 +7473,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v1
@@ -6495,13 +7505,13 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX11-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s18
@@ -6515,7 +7525,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6544,12 +7554,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s11, s17
@@ -6565,7 +7575,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX90A-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, s4
-; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6591,12 +7601,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v6
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b32 s11, s17
@@ -6612,7 +7622,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX908-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX908-NEXT:    s_mov_b32 s13, 0x7060302
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s4
-; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6639,12 +7649,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s11, s17
@@ -6658,7 +7668,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
@@ -6688,12 +7698,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s11, s17
@@ -6712,7 +7722,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -6734,12 +7744,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s11, s17
@@ -6758,7 +7768,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
-; GFX6-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -6781,18 +7791,18 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
 ; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
-  %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6801,7 +7811,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX12-NEXT:    s_mov_b32 s1, exec_lo
-; GFX12-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
@@ -6815,15 +7825,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
 ; GFX12-NEXT:    ; implicit-def: $vgpr4
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX12-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Loop Header: Depth=1
-; GFX12-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX12-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
@@ -6847,7 +7857,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX12-NEXT:    v_mov_b32_e32 v5, v6
-; GFX12-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX12-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX12-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
@@ -6862,8 +7872,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX12-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX12-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -6872,18 +7882,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX12-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX940-NEXT:    s_mov_b64 s[2:3], exec
-; GFX940-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX940-NEXT:    v_readfirstlane_b32 s6, v2
@@ -6896,7 +7906,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
 ; GFX940-NEXT:    ; implicit-def: $vgpr4
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX940-NEXT:    s_mov_b64 s[2:3], 0
@@ -6904,9 +7914,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    s_movk_i32 s10, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
 ; GFX940-NEXT:    s_mov_b32 s11, 0x7060302
-; GFX940-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Loop Header: Depth=1
-; GFX940-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX940-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX940-NEXT:    v_min_f32_e32 v4, v4, v9
@@ -6927,7 +7937,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    v_cndmask_b32_e32 v5, v6, v11, vcc
 ; GFX940-NEXT:    v_perm_b32 v6, v5, v4, s11
 ; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX940-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX940-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX940-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX940-NEXT:    v_readfirstlane_b32 s5, v1
@@ -6941,8 +7951,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
 ; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX940-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX940-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -6950,19 +7960,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX940-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_mov_b32 s2, exec_lo
-; GFX11-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
@@ -6976,16 +7986,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
 ; GFX11-NEXT:    ; implicit-def: $vgpr4
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX11-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
@@ -7009,7 +8019,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v5, v6
-; GFX11-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX11-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
@@ -7024,8 +8034,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
 ; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX11-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7035,20 +8045,20 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX11-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
-; GFX10-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7061,14 +8071,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    ; implicit-def: $vgpr4
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX10-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX10-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
@@ -7089,7 +8099,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    v_perm_b32 v5, v5, v4, 0x7060302
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
-; GFX10-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX10-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7103,8 +8113,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX10-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
@@ -7114,18 +8124,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX10-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_u32_e32 v8, 0x400, v4
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
-; GFX90A-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX90A-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7138,7 +8148,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
@@ -7146,9 +8156,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    s_movk_i32 s14, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
 ; GFX90A-NEXT:    s_mov_b32 s15, 0x7060302
-; GFX90A-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX90A-NEXT:    v_min_f32_e32 v4, v4, v9
@@ -7167,7 +8177,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    v_perm_b32 v6, v5, v4, s15
 ; GFX90A-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX90A-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX90A-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7180,8 +8190,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
 ; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX90A-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX90A-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
@@ -7189,18 +8199,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX90A-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_u32_e32 v7, 0x400, v4
 ; GFX908-NEXT:    s_mov_b64 s[6:7], exec
-; GFX908-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7213,7 +8223,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX908-NEXT:    ; implicit-def: $vgpr4
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
@@ -7221,9 +8231,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    s_movk_i32 s14, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
 ; GFX908-NEXT:    s_mov_b32 s15, 0x7060302
-; GFX908-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX908-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX908-NEXT:    v_min_f32_e32 v4, v4, v8
@@ -7243,7 +8253,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX908-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v6
-; GFX908-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX908-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX908-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7256,8 +8266,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX908-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX908-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX908-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -7265,18 +8275,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX908-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x400, v4
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7289,15 +8299,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
 ; GFX8-NEXT:    ; implicit-def: $vgpr4
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
 ; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; GFX8-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX8-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX8-NEXT:    v_min_f32_e32 v4, v4, v8
@@ -7320,7 +8330,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX8-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v6
-; GFX8-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX8-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7333,8 +8343,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
 ; GFX8-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX8-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX8-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
@@ -7342,18 +8352,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX8-NEXT:    buffer_wbinvl1
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX8-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 0x400, v4
 ; GFX7-NEXT:    s_mov_b64 s[6:7], exec
-; GFX7-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX7-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7365,7 +8375,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX7-NEXT:    ; implicit-def: $vgpr4
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX7-NEXT:  ; %bb.2:
 ; GFX7-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
@@ -7376,9 +8386,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX7-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Loop Header: Depth=1
-; GFX7-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX7-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v7
@@ -7392,7 +8402,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX7-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX7-NEXT:    v_mov_b32_e32 v6, v4
-; GFX7-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX7-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX7-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX7-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX7-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7405,8 +8415,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
 ; GFX7-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX7-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX7-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
@@ -7415,19 +8425,19 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX7-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX7-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, 0x400, v4
 ; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:  .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:  .LBB21_1: ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v2
@@ -7439,7 +8449,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX6-NEXT:    ; implicit-def: $vgpr4
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX6-NEXT:  ; %bb.2:
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 1.0, v6
@@ -7450,9 +8460,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
 ; GFX6-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
-; GFX6-NEXT:  .LBB17_3: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB21_3: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Loop Header: Depth=1
-; GFX6-NEXT:    ; Child Loop BB17_4 Depth 2
+; GFX6-NEXT:    ; Child Loop BB21_4 Depth 2
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 1.0, v7
@@ -7466,7 +8476,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX6-NEXT:    s_mov_b64 s[12:13], exec
 ; GFX6-NEXT:    v_mov_b32_e32 v6, v4
-; GFX6-NEXT:  .LBB17_4: ; Parent Loop BB17_3 Depth=1
+; GFX6-NEXT:  .LBB21_4: ; Parent Loop BB21_3 Depth=1
 ; GFX6-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s9, v1
@@ -7479,8 +8489,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
 ; GFX6-NEXT:    s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
+; GFX6-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX6-NEXT:  ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
 ; GFX6-NEXT:    s_mov_b64 exec, s[12:13]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
@@ -7490,14 +8500,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT:    s_cbranch_execnz .LBB17_3
+; GFX6-NEXT:    s_cbranch_execnz .LBB21_3
 ; GFX6-NEXT:  ; %bb.6: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
@@ -7505,43 +8515,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
 ; misc
 ; --------------------------------------------------------------------
 
-define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT:    s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1
-; GFX12-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT:    s_mov_b32 s4, 0
-; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v5, v0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v0, v5, v5
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v4, v0, v2
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX12-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, v0
@@ -7551,7 +8541,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX940-NEXT:    v_max_f32_e32 v2, v1, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v3, s6
-; GFX940-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v0
@@ -7565,79 +8555,38 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX940-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
-; GFX11-NEXT:    s_add_i32 s4, s6, 0x400
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
-; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX11-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT:    buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT:    buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX11-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s18
+; GFX10-NEXT:    v_mov_b32_e32 v1, s18
 ; GFX10-NEXT:    s_mov_b32 s11, s17
 ; GFX10-NEXT:    s_mov_b32 s10, s16
 ; GFX10-NEXT:    s_mov_b32 s9, s7
 ; GFX10-NEXT:    s_mov_b32 s8, s6
-; GFX10-NEXT:    s_add_i32 s4, s18, 0x400
-; GFX10-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
-; GFX10-NEXT:    v_max_f32_e32 v2, v1, v1
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_max_f32_e32 v0, v5, v5
-; GFX10-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX10-NEXT:    v_mov_b32_e32 v0, v4
-; GFX10-NEXT:    v_mov_b32_e32 v1, v5
-; GFX10-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
+; GFX10-NEXT:    buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
@@ -7651,7 +8600,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v1, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
-; GFX90A-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v0
@@ -7666,12 +8615,12 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v0
@@ -7685,7 +8634,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v2, v1, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s6
-; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, v0
@@ -7699,12 +8648,12 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v0
@@ -7718,7 +8667,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s6
-; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v0
@@ -7732,83 +8681,42 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX7-NEXT:    s_mov_b32 s11, s17
 ; GFX7-NEXT:    s_mov_b32 s10, s16
 ; GFX7-NEXT:    s_mov_b32 s9, s7
 ; GFX7-NEXT:    s_mov_b32 s8, s6
-; GFX7-NEXT:    v_mov_b32_e32 v0, s18
-; GFX7-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
-; GFX7-NEXT:    s_add_i32 s6, s18, 0x400
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v3, s6
-; GFX7-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v5
-; GFX7-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v0, v4
-; GFX7-NEXT:    v_mov_b32_e32 v1, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s18
+; GFX7-NEXT:    buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX6-NEXT:    s_mov_b32 s11, s17
 ; GFX6-NEXT:    s_mov_b32 s10, s16
 ; GFX6-NEXT:    s_mov_b32 s9, s7
 ; GFX6-NEXT:    s_mov_b32 s8, s6
-; GFX6-NEXT:    v_mov_b32_e32 v0, s18
-; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
-; GFX6-NEXT:    s_add_i32 s6, s18, 0x400
-; GFX6-NEXT:    s_mov_b64 s[4:5], 0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s6
-; GFX6-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v0
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v5
-; GFX6-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v0, v4
-; GFX6-NEXT:    v_mov_b32_e32 v1, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
+; GFX6-NEXT:    v_mov_b32_e32 v1, s18
+; GFX6-NEXT:    buffer_atomic_fmin v0, v1, s[8:11], 0 offen offset:1024 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
-  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst
+  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-
+attributes #0 = { nounwind }
 
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index a3424793fdc4d..ea2427a3c420f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -929,29 +929,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -982,56 +964,23 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT:    flat_atomic_max_f32 v0, v[0:1], v2 offset:2044 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v0, v[3:4]
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v0
-; GFX10-NEXT:    v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT:    v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1112,26 +1061,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX7-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v0, v[3:4]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT:    v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %result = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1146,28 +1080,11 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1197,28 +1114,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX11-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_atomic_max_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1226,26 +1127,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1326,24 +1213,9 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    flat_atomic_fmax v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %unused = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1393,21 +1265,55 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_max_f32 v0, v[0:1], v2 glc
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v3, v[0:1]
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
+; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
@@ -1485,9 +1391,25 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
+; GFX7-NEXT:    flat_load_dword v3, v[0:1]
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v4, v3
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX7-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret float %result
@@ -2557,29 +2479,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2610,56 +2514,23 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT:    flat_atomic_max_f32 v0, v[0:1], v2 offset:2044 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v0, v[3:4]
-; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v0
-; GFX10-NEXT:    v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT:    v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2740,26 +2611,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX7-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v0, v[3:4]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT:    v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_atomic_fmax v0, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %result = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2774,28 +2630,11 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2825,28 +2664,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX11-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_atomic_max_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2854,26 +2677,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2954,24 +2763,9 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    flat_atomic_fmax v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %unused = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -4016,19 +3810,54 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
@@ -4089,9 +3918,30 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
+; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_load_dword v4, v[0:1]
+; GFX7-NEXT:    flat_load_dword v5, v[5:6]
+; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v7, v5
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
+; GFX7-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX7-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret double %result
@@ -15211,7 +15061,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 
 !0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 0d954e277cdd5..2767b66e44703 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -929,29 +929,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -982,56 +964,23 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v0, v[3:4]
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v0
-; GFX10-NEXT:    v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT:    v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1112,26 +1061,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v0, v[3:4]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT:    v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1146,28 +1080,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1197,28 +1114,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1226,26 +1127,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1326,24 +1213,9 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1393,21 +1265,55 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 glc
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v3, v[0:1]
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
+; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
@@ -1485,9 +1391,25 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
+; GFX7-NEXT:    flat_load_dword v3, v[0:1]
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v4, v3
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX7-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret float %result
@@ -2557,29 +2479,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2610,56 +2514,23 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT:    flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v0, v[3:4]
-; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v0
-; GFX10-NEXT:    v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT:    v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2740,26 +2611,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v0, v[3:4]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v6, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT:    v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_atomic_fmin v0, v[0:1], v2 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2774,28 +2630,11 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2825,28 +2664,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_atomic_min_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2854,26 +2677,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7fc, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    flat_load_dword v3, v[0:1]
-; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2954,24 +2763,9 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x7fc, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT:    flat_load_dword v3, v[0:1]
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT:    flat_atomic_fmin v[0:1], v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v2
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %ptr, i64 511
   %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -4016,19 +3810,54 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX90A-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
@@ -4089,9 +3918,30 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
 ; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
+; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_load_dword v4, v[0:1]
+; GFX7-NEXT:    flat_load_dword v5, v[5:6]
+; GFX7-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v7, v5
+; GFX7-NEXT:    v_mov_b32_e32 v6, v4
+; GFX7-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX7-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret double %result
@@ -15211,7 +15061,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 
 !0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index 4f7b6164936f8..06d971febd038 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -13,8 +13,8 @@
 ; float
 ; --------------------------------------------------------------------
 
-define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f32:
+define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -28,7 +28,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -53,7 +53,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -63,7 +63,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -73,7 +73,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -97,7 +97,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -121,7 +121,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
@@ -145,7 +145,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -158,7 +158,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f32:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -171,12 +171,12 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -190,7 +190,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -215,7 +215,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -225,7 +225,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -235,7 +235,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -259,7 +259,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -283,7 +283,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -308,7 +308,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -321,7 +321,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -335,12 +335,12 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -354,7 +354,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -379,7 +379,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -389,7 +389,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -399,7 +399,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -423,7 +423,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -447,7 +447,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -472,7 +472,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -485,7 +485,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -499,12 +499,12 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f32:
+define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -518,7 +518,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -542,7 +542,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -552,7 +552,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -562,7 +562,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -585,7 +585,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -608,7 +608,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
@@ -631,7 +631,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -643,7 +643,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f32:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -655,12 +655,12 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -674,7 +674,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -698,7 +698,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -708,7 +708,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -718,7 +718,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -741,7 +741,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -764,7 +764,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -789,7 +789,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -801,7 +801,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -814,12 +814,12 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -833,7 +833,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -857,7 +857,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -867,7 +867,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -877,7 +877,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -900,7 +900,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -923,7 +923,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -948,7 +948,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -960,7 +960,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -973,44 +973,26 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1035,61 +1017,27 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1115,7 +1063,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1139,7 +1087,7 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -1164,105 +1112,53 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1286,59 +1182,27 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off offset:2044
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1363,7 +1227,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1386,7 +1250,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -1411,76 +1275,37 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_mov_b32_e32 v6, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v3
-; GFX6-NEXT:    v_mov_b32_e32 v5, v2
-; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v5
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-; --------------------------------------------------------------------
-; float with ftz/daz
-; --------------------------------------------------------------------
-
-define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1494,7 +1319,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -1519,27 +1344,61 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off glc
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -1563,7 +1422,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -1587,7 +1446,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
@@ -1611,38 +1470,73 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX7-NEXT:    v_max_f32_e32 v4, v3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
+; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX6-NEXT:    v_max_f32_e32 v4, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
+; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1651,15 +1545,15 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB9_1: ; %atomicrmw.start
@@ -1669,7 +1563,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1681,30 +1575,30 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
@@ -1713,7 +1607,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1725,10 +1619,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
@@ -1737,7 +1631,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
 ; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
@@ -1749,64 +1643,66 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX8-NEXT:    v_max_f32_e32 v5, v0, v1
-; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; --------------------------------------------------------------------
+; float with ftz/daz
+; --------------------------------------------------------------------
+
+define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1815,15 +1711,15 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
@@ -1833,7 +1729,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1845,30 +1741,30 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
@@ -1877,7 +1773,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1889,10 +1785,10 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
@@ -1901,7 +1797,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
 ; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
@@ -1913,64 +1809,62 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX8-NEXT:    v_max_f32_e32 v5, v0, v1
-; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX8-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1979,154 +1873,162 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX8-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
+; GFX8-NEXT:    v_max_f32_e32 v5, v0, v1
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
-  ret void
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %result
 }
 
-define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2135,157 +2037,162 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off offset:2044
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off offset:-2048 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off offset:2044
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off offset:-2048 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX8-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
+; GFX8-NEXT:    v_max_f32_e32 v5, v0, v1
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s4, s6
-; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
-  ret void
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
+  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %result
 }
 
-define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2294,15 +2201,15 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
@@ -2311,7 +2218,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2323,30 +2230,30 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off offset:-2048
+; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
@@ -2354,7 +2261,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2366,10 +2273,10 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
@@ -2377,7 +2284,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2389,11 +2296,9 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
@@ -2414,324 +2319,212 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v3, v2
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v5, v3
-; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX940-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_inv sc0 sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_max_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off offset:2044
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
-; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX90A-NEXT:    buffer_wbl2
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v4, v3
-; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
 ; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX8-NEXT:    v_max_f32_e32 v5, v0, v1
-; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT:    v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst
-  ret float %result
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
 }
 
-define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v4
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
@@ -2739,10 +2532,10 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX940-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
@@ -2752,62 +2545,30 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off offset:-2048
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
@@ -2815,10 +2576,8 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    buffer_wbl2
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
@@ -2829,10 +2588,10 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
@@ -2840,7 +2599,7 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2852,11 +2611,11 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
@@ -2877,86 +2636,723 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
+define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT:    v_max_f32_e32 v4, v3, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT:    v_max_f32_e32 v4, v3, v2
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT:    v_max_f32_e32 v3, v3, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
+; GFX8-NEXT:    v_max_f32_e32 v5, v0, v1
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_mov_b32_e32 v6, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v3
-; GFX6-NEXT:    v_mov_b32_e32 v5, v2
-; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v5
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+  %result = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %result
+}
+
+define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_max_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax v[0:1], v2, off offset:2044
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX8-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
+; --------------------------------------------------------------------
+; double
+; --------------------------------------------------------------------
+
+define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, v3
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
+}
+
+define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7f8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v9, v1
+; GFX8-NEXT:    v_mov_b32_e32 v8, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX8-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, float %val seq_cst
-  ret void
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
+  %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
 }
 
-; --------------------------------------------------------------------
-; double
-; --------------------------------------------------------------------
-
-define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f64:
+define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
@@ -2965,35 +3361,35 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
@@ -3001,7 +3397,7 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
 ; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3009,89 +3405,89 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
 ; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v7, v5
-; GFX8-NEXT:    v_mov_b32_e32 v6, v4
-; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT:    v_mov_b32_e32 v9, v1
+; GFX8-NEXT:    v_mov_b32_e32 v8, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX8-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v4
-; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
@@ -3099,13 +3495,13 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f64:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s4, s6
-; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3113,368 +3509,350 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
+  %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }
 
-define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v7, v5
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
-; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v4
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7f8, v0
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v9, v1
-; GFX8-NEXT:    v_mov_b32_e32 v8, v0
-; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_mov_b32_e32 v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
-  %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret double %result
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
 }
 
-define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:2040
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:2040
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
+; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v7, v5
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
-; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v4
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7f8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v9, v1
-; GFX8-NEXT:    v_mov_b32_e32 v8, v0
-; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_mov_b32_e32 v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
-  %result = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret double %result
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
 }
 
-define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f64:
+define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
@@ -3482,7 +3860,7 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
@@ -3490,34 +3868,34 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3526,41 +3904,41 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
 ; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -3568,18 +3946,20 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
@@ -3592,375 +3972,466 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f64:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s4, s6
-; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:2040
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off offset:2040
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:2040
+; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v5, v3
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7f8, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    v_mov_b32_e32 v5, v1
+; GFX7-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v11, v1
+; GFX7-NEXT:    v_mov_b32_e32 v10, v0
+; GFX7-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX7-NEXT:    v_max_f64 v[8:9], v[0:1], v[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v8
+; GFX7-NEXT:    v_mov_b32_e32 v1, v9
+; GFX7-NEXT:    v_mov_b32_e32 v2, v10
+; GFX7-NEXT:    v_mov_b32_e32 v3, v11
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX6-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v11, v1
+; GFX6-NEXT:    v_mov_b32_e32 v10, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX6-NEXT:    v_max_f64 v[8:9], v[0:1], v[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v8
+; GFX6-NEXT:    v_mov_b32_e32 v1, v9
+; GFX6-NEXT:    v_mov_b32_e32 v2, v10
+; GFX6-NEXT:    v_mov_b32_e32 v3, v11
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret void
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret double %result
 }
 
-define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048
+; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB21_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[2:3], off offset:-2048
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[2:3], off offset:-2048
+; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v5, v3
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT:    v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
-; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
-; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret void
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret double %result
 }
 
 ; --------------------------------------------------------------------
 ; half
 ; --------------------------------------------------------------------
 
-define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f16:
+define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3978,7 +4449,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -4000,13 +4471,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -4019,7 +4490,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -4035,13 +4506,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v0
@@ -4055,7 +4526,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -4077,13 +4548,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -4095,7 +4566,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -4112,13 +4583,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -4131,7 +4602,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -4146,13 +4617,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -4165,7 +4636,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -4180,13 +4651,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -4199,7 +4670,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -4215,13 +4686,13 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -4238,7 +4709,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
 ; GFX7-NEXT:    v_not_b32_e32 v7, v2
-; GFX7-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -4257,14 +4728,14 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f16:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -4281,7 +4752,7 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
 ; GFX6-NEXT:    v_not_b32_e32 v7, v2
-; GFX6-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -4300,19 +4771,19 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4331,7 +4802,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -4353,13 +4824,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -4374,7 +4845,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -4390,13 +4861,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -4411,7 +4882,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -4433,13 +4904,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -4452,7 +4923,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -4469,13 +4940,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -4489,7 +4960,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -4504,13 +4975,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -4524,7 +4995,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -4539,13 +5010,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -4559,7 +5030,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -4575,13 +5046,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -4599,7 +5070,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX7-NEXT:    v_not_b32_e32 v8, v2
-; GFX7-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -4618,14 +5089,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -4643,7 +5114,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX6-NEXT:    v_not_b32_e32 v8, v2
-; GFX6-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -4663,7 +5134,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -4671,12 +5142,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4695,7 +5166,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -4717,13 +5188,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -4739,7 +5210,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -4755,13 +5226,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -4776,7 +5247,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -4798,13 +5269,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -4817,7 +5288,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -4834,13 +5305,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -4854,7 +5325,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -4869,13 +5340,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -4889,7 +5360,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -4904,13 +5375,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -4924,7 +5395,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -4940,13 +5411,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -4964,7 +5435,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX7-NEXT:    v_not_b32_e32 v8, v2
-; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -4983,14 +5454,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -5008,7 +5479,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX6-NEXT:    v_not_b32_e32 v8, v2
-; GFX6-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -5028,7 +5499,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -5036,12 +5507,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
-  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
  }
 
-define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f16:
+define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5059,7 +5530,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v6, v3
-; GFX12-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5081,12 +5552,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -5099,7 +5570,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX940-NEXT:    v_not_b32_e32 v6, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
@@ -5115,12 +5586,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v0
@@ -5134,7 +5605,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v6, v3
-; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5156,12 +5627,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -5173,7 +5644,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v6, v3
-; GFX10-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5190,12 +5661,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -5208,7 +5679,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX90A-NEXT:    v_not_b32_e32 v6, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
@@ -5223,12 +5694,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -5241,7 +5712,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX908-NEXT:    v_not_b32_e32 v6, v3
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5256,12 +5727,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -5274,7 +5745,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX8-NEXT:    v_not_b32_e32 v6, v3
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5290,12 +5761,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -5312,7 +5783,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
@@ -5331,12 +5802,12 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f16:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -5353,7 +5824,7 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
@@ -5373,17 +5844,17 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5402,7 +5873,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5424,12 +5895,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -5444,7 +5915,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_not_b32_e32 v5, v5
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX940-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5460,12 +5931,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -5480,7 +5951,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
-; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5502,12 +5973,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -5520,7 +5991,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5537,12 +6008,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -5556,7 +6027,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_not_b32_e32 v5, v5
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX90A-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5571,12 +6042,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -5590,7 +6061,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_not_b32_e32 v5, v5
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5605,12 +6076,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -5624,7 +6095,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5640,12 +6111,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -5663,7 +6134,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX7-NEXT:    v_not_b32_e32 v6, v2
-; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5682,12 +6153,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -5705,7 +6176,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_not_b32_e32 v6, v2
-; GFX6-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5725,18 +6196,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5755,7 +6226,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5777,12 +6248,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -5798,7 +6269,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    v_not_b32_e32 v5, v5
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX940-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5814,12 +6285,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -5834,7 +6305,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
-; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5856,12 +6327,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -5874,7 +6345,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5891,12 +6362,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -5910,7 +6381,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    v_not_b32_e32 v5, v5
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5925,12 +6396,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -5944,7 +6415,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    v_not_b32_e32 v5, v5
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5959,12 +6430,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
@@ -5978,7 +6449,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5994,12 +6465,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -6017,7 +6488,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX7-NEXT:    v_not_b32_e32 v6, v2
-; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -6036,12 +6507,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -6059,7 +6530,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_not_b32_e32 v6, v2
-; GFX6-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -6079,18 +6550,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6100,7 +6571,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -6119,20 +6590,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
-; GFX940-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -6146,19 +6617,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -6177,19 +6648,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -6205,20 +6676,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX90A-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -6231,20 +6702,20 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX908-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -6257,13 +6728,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -6271,7 +6742,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v1, v2, v2
-; GFX8-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -6285,12 +6756,12 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -6301,7 +6772,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
-; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6318,13 +6789,13 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -6335,7 +6806,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
-; GFX6-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6353,19 +6824,19 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4
+  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6375,7 +6846,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_max_num_f16_e32 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f16_e32 v2, v3, v3
@@ -6394,19 +6865,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
-; GFX940-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6420,18 +6891,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6450,18 +6921,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6477,19 +6948,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX90A-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6502,19 +6973,19 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX908-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6527,12 +6998,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
@@ -6540,7 +7011,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v4, v2, v2
-; GFX8-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6554,12 +7025,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -6570,7 +7041,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6587,12 +7058,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -6603,7 +7074,7 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX6-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6621,18 +7092,18 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6651,7 +7122,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -6673,13 +7144,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -6694,7 +7165,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -6710,13 +7181,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -6731,7 +7202,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -6753,13 +7224,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -6772,7 +7243,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -6789,13 +7260,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -6809,7 +7280,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -6826,13 +7297,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -6846,7 +7317,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -6861,13 +7332,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -6881,7 +7352,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -6897,13 +7368,13 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -6921,7 +7392,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX7-NEXT:    v_not_b32_e32 v8, v2
-; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -6940,14 +7411,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -6965,7 +7436,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX6-NEXT:    v_not_b32_e32 v8, v2
-; GFX6-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -6985,7 +7456,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -6993,12 +7464,12 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -7017,7 +7488,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7039,12 +7510,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -7059,7 +7530,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_not_b32_e32 v5, v5
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX940-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7075,12 +7546,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -7095,7 +7566,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
-; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7117,12 +7588,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -7135,7 +7606,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7152,12 +7623,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -7171,7 +7642,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_not_b32_e32 v5, v5
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX90A-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7188,12 +7659,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -7207,7 +7678,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_not_b32_e32 v5, v5
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX908-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7222,12 +7693,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -7241,7 +7712,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7257,12 +7728,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -7280,7 +7751,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX7-NEXT:    v_not_b32_e32 v6, v2
-; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7299,12 +7770,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -7322,7 +7793,7 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_not_b32_e32 v6, v2
-; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7342,13 +7813,13 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -7356,8 +7827,8 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
 ; bfloat
 ; --------------------------------------------------------------------
 
-define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16:
+define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -7374,7 +7845,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -7403,13 +7874,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -7423,7 +7894,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -7445,13 +7916,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
@@ -7465,7 +7936,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -7494,13 +7965,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -7512,7 +7983,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -7533,13 +8004,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -7553,7 +8024,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -7572,13 +8043,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -7592,7 +8063,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -7611,13 +8082,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -7630,7 +8101,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -7651,13 +8122,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -7674,7 +8145,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -7694,14 +8165,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -7718,7 +8189,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX6-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -7738,19 +8209,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -7769,7 +8240,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -7798,13 +8269,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -7820,7 +8291,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -7842,13 +8313,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -7864,7 +8335,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -7893,13 +8364,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -7912,7 +8383,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -7933,13 +8404,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -7954,7 +8425,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -7973,13 +8444,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -7994,7 +8465,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -8013,13 +8484,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -8033,7 +8504,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -8054,13 +8525,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -8078,7 +8549,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8098,14 +8569,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -8123,7 +8594,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX6-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8144,7 +8615,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -8152,12 +8623,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -8176,7 +8647,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -8205,13 +8676,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -8228,7 +8699,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -8250,13 +8721,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -8272,7 +8743,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -8301,13 +8772,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -8320,7 +8791,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -8341,13 +8812,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -8362,7 +8833,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -8381,13 +8852,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -8402,7 +8873,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -8421,13 +8892,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -8441,7 +8912,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -8462,13 +8933,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -8486,7 +8957,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8506,14 +8977,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -8531,7 +9002,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX6-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8552,7 +9023,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -8560,12 +9031,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
-  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
  }
 
-define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16:
+define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -8582,7 +9053,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v6, v3
-; GFX12-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8610,12 +9081,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -8629,7 +9100,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8651,12 +9122,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
@@ -8670,7 +9141,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v6, v3
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8698,12 +9169,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -8715,7 +9186,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v6, v3
-; GFX10-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8736,12 +9207,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -8755,7 +9226,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8774,12 +9245,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -8793,7 +9264,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8812,12 +9283,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -8830,7 +9301,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    v_not_b32_e32 v6, v3
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8851,12 +9322,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -8873,7 +9344,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8893,12 +9364,12 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -8915,7 +9386,7 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX6-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8936,17 +9407,17 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -8965,7 +9436,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -8993,12 +9464,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -9014,7 +9485,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9036,12 +9507,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -9057,7 +9528,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9085,12 +9556,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -9103,7 +9574,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9124,12 +9595,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -9144,7 +9615,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9163,12 +9634,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -9183,7 +9654,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9202,12 +9673,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -9221,7 +9692,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9242,12 +9713,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -9265,7 +9736,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9285,12 +9756,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -9308,7 +9779,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9329,18 +9800,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -9359,7 +9830,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9387,12 +9858,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -9409,7 +9880,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9431,12 +9902,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -9452,7 +9923,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9480,12 +9951,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -9498,7 +9969,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9519,12 +9990,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -9539,7 +10010,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9558,12 +10029,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -9578,7 +10049,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9597,12 +10068,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
@@ -9616,7 +10087,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9637,12 +10108,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -9660,7 +10131,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9680,12 +10151,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -9703,7 +10174,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9724,18 +10195,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -9745,7 +10216,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -9771,13 +10242,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -9785,7 +10256,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
-; GFX940-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -9806,20 +10277,20 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -9845,19 +10316,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -9878,13 +10349,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -9892,7 +10363,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX90A-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -9911,13 +10382,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -9925,7 +10396,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX908-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -9944,13 +10415,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -9958,7 +10429,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -9978,12 +10449,12 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -9994,7 +10465,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10012,13 +10483,13 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -10029,7 +10500,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10048,19 +10519,19 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4
+  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -10070,7 +10541,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10095,12 +10566,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -10108,7 +10579,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
-; GFX940-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10129,19 +10600,19 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10166,18 +10637,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10198,12 +10669,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -10211,7 +10682,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX90A-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10230,12 +10701,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -10243,7 +10714,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX908-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10262,12 +10733,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
@@ -10275,7 +10746,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10295,12 +10766,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -10311,7 +10782,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10329,12 +10800,12 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -10345,7 +10816,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX6-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10364,18 +10835,18 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -10394,7 +10865,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -10423,13 +10894,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -10445,7 +10916,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -10467,13 +10938,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -10489,7 +10960,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -10518,13 +10989,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -10537,7 +11008,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -10558,13 +11029,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -10579,7 +11050,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -10600,13 +11071,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -10621,7 +11092,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -10640,13 +11111,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -10660,7 +11131,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -10681,13 +11152,13 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -10705,7 +11176,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -10725,14 +11196,14 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -10750,7 +11221,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX6-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -10771,7 +11242,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -10779,12 +11250,12 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -10803,7 +11274,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -10831,12 +11302,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -10852,7 +11323,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -10874,12 +11345,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -10895,7 +11366,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -10923,12 +11394,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -10941,7 +11412,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -10962,12 +11433,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -10982,7 +11453,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -11003,12 +11474,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -11023,7 +11494,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -11042,12 +11513,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -11061,7 +11532,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -11082,12 +11553,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -11105,7 +11576,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -11125,12 +11596,12 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -11148,7 +11619,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -11169,13 +11640,13 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -11183,8 +11654,8 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
 ; <2 x half>
 ; --------------------------------------------------------------------
 
-define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16:
+define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -11194,7 +11665,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -11210,19 +11681,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -11236,19 +11707,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -11264,19 +11735,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -11290,19 +11761,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -11314,19 +11785,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -11338,20 +11809,20 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v3
@@ -11366,13 +11837,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -11389,7 +11860,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11414,14 +11885,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -11438,7 +11909,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX6-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11464,19 +11935,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -11486,7 +11957,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -11502,19 +11973,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -11528,19 +11999,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -11556,19 +12027,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -11582,19 +12053,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -11606,19 +12077,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -11630,13 +12101,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -11645,7 +12116,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -11660,12 +12131,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -11682,7 +12153,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11707,14 +12178,14 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -11731,7 +12202,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX6-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11757,7 +12228,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
@@ -11765,12 +12236,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -11780,7 +12251,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -11796,19 +12267,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -11822,19 +12293,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -11850,19 +12321,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -11876,19 +12347,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -11900,19 +12371,19 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -11924,13 +12395,13 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -11939,7 +12410,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -11954,12 +12425,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -11980,7 +12451,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -12005,12 +12476,12 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -12031,7 +12502,7 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -12057,18 +12528,18 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16:
+define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12078,7 +12549,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -12094,18 +12565,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12119,18 +12590,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12146,18 +12617,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12171,18 +12642,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12194,18 +12665,18 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12217,19 +12688,19 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -12244,12 +12715,12 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -12266,7 +12737,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12291,12 +12762,12 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -12313,7 +12784,7 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12339,17 +12810,17 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12359,7 +12830,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -12375,18 +12846,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12400,18 +12871,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12427,18 +12898,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12452,18 +12923,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12475,18 +12946,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12498,12 +12969,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -12512,7 +12983,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -12527,12 +12998,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -12549,7 +13020,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12574,12 +13045,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -12596,7 +13067,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12622,18 +13093,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12643,7 +13114,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -12659,18 +13130,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12684,18 +13155,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12711,18 +13182,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12736,18 +13207,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12759,18 +13230,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12782,12 +13253,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -12796,7 +13267,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -12811,12 +13282,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -12837,7 +13308,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12862,12 +13333,12 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -12888,7 +13359,7 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12914,18 +13385,18 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12935,7 +13406,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -12951,19 +13422,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -12977,19 +13448,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -13005,19 +13476,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -13031,19 +13502,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -13057,19 +13528,19 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -13081,13 +13552,13 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -13096,7 +13567,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -13111,12 +13582,12 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -13133,7 +13604,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX7-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -13158,14 +13629,14 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -13182,7 +13653,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX6-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -13208,7 +13679,7 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
@@ -13216,12 +13687,12 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -13231,7 +13702,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -13247,18 +13718,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13272,18 +13743,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13299,18 +13770,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13324,18 +13795,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13349,18 +13820,18 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13372,12 +13843,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -13386,7 +13857,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -13401,12 +13872,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -13423,7 +13894,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -13448,12 +13919,12 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -13470,7 +13941,7 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -13496,13 +13967,13 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -13510,8 +13981,8 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
 ; <2 x bfloat>
 ; --------------------------------------------------------------------
 
-define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -13522,7 +13993,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -13555,13 +14026,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -13570,7 +14041,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -13597,13 +14068,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
@@ -13612,7 +14083,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -13645,21 +14116,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -13686,13 +14157,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -13701,7 +14172,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -13726,13 +14197,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -13741,7 +14212,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -13766,20 +14237,20 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v3
@@ -13807,13 +14278,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -13829,7 +14300,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -13851,14 +14322,14 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -13874,7 +14345,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -13897,19 +14368,19 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -13920,7 +14391,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -13953,13 +14424,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -13968,7 +14439,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -13995,13 +14466,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -14010,7 +14481,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -14043,21 +14514,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -14084,13 +14555,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -14099,7 +14570,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -14124,13 +14595,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -14139,7 +14610,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -14164,13 +14635,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -14179,7 +14650,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -14207,12 +14678,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -14228,7 +14699,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -14250,14 +14721,14 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -14273,7 +14744,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -14296,7 +14767,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
@@ -14304,12 +14775,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -14320,7 +14791,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -14353,13 +14824,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -14368,7 +14839,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -14395,13 +14866,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
@@ -14410,7 +14881,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -14443,21 +14914,21 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -14484,13 +14955,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -14499,7 +14970,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -14524,13 +14995,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -14539,7 +15010,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -14564,13 +15035,13 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -14579,7 +15050,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -14607,12 +15078,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -14632,7 +15103,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX7-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -14654,12 +15125,12 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -14679,7 +15150,7 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX6-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -14702,18 +15173,18 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -14724,7 +15195,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14756,12 +15227,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -14770,7 +15241,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14797,12 +15268,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
@@ -14811,7 +15282,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14843,20 +15314,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14883,12 +15354,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -14897,7 +15368,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14922,12 +15393,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -14936,7 +15407,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14961,19 +15432,19 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15001,12 +15472,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -15022,7 +15493,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15044,12 +15515,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -15065,7 +15536,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15088,17 +15559,17 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -15109,7 +15580,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15141,12 +15612,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15155,7 +15626,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15182,12 +15653,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -15196,7 +15667,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15228,20 +15699,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15268,12 +15739,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15282,7 +15753,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15307,12 +15778,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15321,7 +15792,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15346,12 +15817,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -15360,7 +15831,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15388,12 +15859,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -15409,7 +15880,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15431,12 +15902,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -15452,7 +15923,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15475,18 +15946,18 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -15497,7 +15968,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15529,12 +16000,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -15543,7 +16014,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15570,12 +16041,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
@@ -15584,7 +16055,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15616,20 +16087,20 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15656,12 +16127,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -15670,7 +16141,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15695,12 +16166,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -15709,7 +16180,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15734,12 +16205,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -15748,7 +16219,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15776,12 +16247,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -15801,7 +16272,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15823,12 +16294,12 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -15848,7 +16319,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15871,18 +16342,18 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -15893,7 +16364,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -15926,13 +16397,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15941,7 +16412,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -15968,13 +16439,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -15983,7 +16454,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -16016,21 +16487,21 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -16057,13 +16528,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16072,7 +16543,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -16099,13 +16570,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16114,7 +16585,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -16139,13 +16610,13 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -16154,7 +16625,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -16182,12 +16653,12 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -16203,7 +16674,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -16225,14 +16696,14 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -16248,7 +16719,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -16271,7 +16742,7 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
@@ -16279,12 +16750,12 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
+  %result = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -16295,7 +16766,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16327,12 +16798,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16341,7 +16812,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16368,12 +16839,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -16382,7 +16853,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16414,20 +16885,20 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16454,12 +16925,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16468,7 +16939,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16495,12 +16966,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16509,7 +16980,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16534,12 +17005,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -16548,7 +17019,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16576,12 +17047,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -16597,7 +17068,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -16619,12 +17090,12 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -16640,7 +17111,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -16663,15 +17134,17 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
+  %unused = atomicrmw fmax ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index 591e01b11bd24..65df8f07fb8b3 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -13,8 +13,8 @@
 ; float
 ; --------------------------------------------------------------------
 
-define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f32:
+define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -28,7 +28,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -53,7 +53,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -63,7 +63,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -73,7 +73,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -97,7 +97,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -121,7 +121,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
@@ -145,7 +145,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -158,7 +158,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f32:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -171,12 +171,12 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -190,7 +190,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -215,7 +215,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -225,7 +225,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -235,7 +235,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -259,7 +259,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -283,7 +283,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -308,7 +308,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -321,7 +321,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -335,12 +335,12 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -354,7 +354,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -379,7 +379,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -389,7 +389,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -399,7 +399,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -423,7 +423,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -447,7 +447,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -472,7 +472,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -485,7 +485,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -499,12 +499,12 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f32:
+define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -518,7 +518,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -542,7 +542,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -552,7 +552,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -562,7 +562,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -585,7 +585,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -608,7 +608,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
@@ -631,7 +631,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -643,7 +643,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f32:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -655,12 +655,12 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -674,7 +674,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -698,7 +698,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -708,7 +708,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -718,7 +718,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -741,7 +741,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -764,7 +764,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -789,7 +789,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -801,7 +801,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -814,12 +814,12 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -833,7 +833,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -857,7 +857,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -867,7 +867,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -877,7 +877,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -900,7 +900,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -923,7 +923,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -948,7 +948,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -960,7 +960,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -973,44 +973,26 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1035,61 +1017,27 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1115,7 +1063,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1139,7 +1087,7 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -1164,105 +1112,53 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
 ; GFX12-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1286,59 +1182,27 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off offset:2044
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1363,7 +1227,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -1386,7 +1250,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -1411,76 +1275,37 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_mov_b32_e32 v6, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v3
-; GFX6-NEXT:    v_mov_b32_e32 v5, v2
-; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v5
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-; --------------------------------------------------------------------
-; float with ftz/daz
-; --------------------------------------------------------------------
-
-define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1494,7 +1319,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -1519,27 +1344,61 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off
+; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off glc
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -1563,7 +1422,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -1587,7 +1446,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
@@ -1611,38 +1470,73 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v5, v3
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX7-NEXT:    v_min_f32_e32 v4, v3, v2
+; GFX7-NEXT:    v_mov_b32_e32 v3, v4
+; GFX7-NEXT:    v_mov_b32_e32 v4, v5
+; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v5, v3
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX6-NEXT:    v_min_f32_e32 v4, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v3, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, v5
+; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1651,15 +1545,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB9_1: ; %atomicrmw.start
@@ -1669,7 +1563,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1681,30 +1575,30 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
@@ -1713,7 +1607,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1725,10 +1619,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
@@ -1737,7 +1631,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
 ; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
@@ -1749,64 +1643,66 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
-; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %result
 }
 
-define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; --------------------------------------------------------------------
+; float with ftz/daz
+; --------------------------------------------------------------------
+
+define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1815,15 +1711,15 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
@@ -1833,7 +1729,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1845,30 +1741,30 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
@@ -1877,7 +1773,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
 ; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1889,10 +1785,10 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
@@ -1901,7 +1797,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
 ; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
@@ -1913,64 +1809,62 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
-; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX8-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
 
-define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -1979,154 +1873,162 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
+; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
-  ret void
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %result
 }
 
-define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2135,157 +2037,162 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
-; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX940-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off offset:2044
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off offset:-2048 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off offset:2044
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off offset:-2048 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX90A-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
 ; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
+; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s4, s6
-; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
-  ret void
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
+  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %result
 }
 
-define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -2294,15 +2201,15 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
@@ -2311,7 +2218,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2323,30 +2230,30 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off offset:-2048
+; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
@@ -2354,7 +2261,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2366,10 +2273,10 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
@@ -2377,7 +2284,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2389,11 +2296,9 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
@@ -2414,324 +2319,212 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v4, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT:    v_min_num_f32_e32 v3, v3, v2
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
-; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v5, v3
-; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX940-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_inv sc0 sc1
-; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT:    v_min_f32_e32 v3, v3, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off offset:2044
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
-; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
-; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
-; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX90A-NEXT:    buffer_wbl2
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v4, v3
-; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
 ; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v6, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
-; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
-; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v5, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT:    v_mov_b32_e32 v3, v4
-; GFX7-NEXT:    v_mov_b32_e32 v4, v5
-; GFX7-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v5, v3
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT:    v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, v5
-; GFX6-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst
-  ret float %result
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
 }
 
-define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT:    v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f32_e32 v2, v2, v4
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_mov_b32_e32 v3, v2
-; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
@@ -2739,10 +2532,10 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX940-NEXT:    buffer_wbl2 sc0 sc1
-; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
@@ -2752,62 +2545,30 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT:    v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off offset:-2048
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, v2
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
@@ -2815,10 +2576,8 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX90A-NEXT:    buffer_wbl2
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
@@ -2829,10 +2588,10 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
 ; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
@@ -2840,7 +2599,7 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
@@ -2852,11 +2611,11 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
@@ -2877,86 +2636,723 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
+define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX940-NEXT:    v_min_f32_e32 v4, v3, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX90A-NEXT:    v_min_f32_e32 v4, v3, v2
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX908-NEXT:    v_min_f32_e32 v3, v3, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v6
+; GFX8-NEXT:    v_min_f32_e32 v5, v0, v1
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT:    v_mov_b32_e32 v6, v3
-; GFX7-NEXT:    v_mov_b32_e32 v5, v2
-; GFX7-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, v3
-; GFX6-NEXT:    v_mov_b32_e32 v5, v2
-; GFX6-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT:    v_mov_b32_e32 v3, v5
-; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+  %result = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %result
+}
+
+define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX940-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_min_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin v[0:1], v2, off offset:2044
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX908-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:    v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX8-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
+; --------------------------------------------------------------------
+; double
+; --------------------------------------------------------------------
+
+define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, v3
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
+}
+
+define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    global_wb scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7f8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v9, v1
+; GFX8-NEXT:    v_mov_b32_e32 v8, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX8-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, float %val seq_cst
-  ret void
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
+  %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret double %result
 }
 
-; --------------------------------------------------------------------
-; double
-; --------------------------------------------------------------------
-
-define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f64:
+define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
@@ -2965,35 +3361,35 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
@@ -3001,7 +3397,7 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
 ; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3009,89 +3405,89 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
 ; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v7, v5
-; GFX8-NEXT:    v_mov_b32_e32 v6, v4
-; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT:    v_mov_b32_e32 v9, v1
+; GFX8-NEXT:    v_mov_b32_e32 v8, v0
+; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX8-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v0, v4
-; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
@@ -3099,13 +3495,13 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f64:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s4, s6
-; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
@@ -3113,368 +3509,350 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
+  %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }
 
-define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:2040 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v7, v5
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
-; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v4
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7f8, v0
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB21_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v9, v1
-; GFX8-NEXT:    v_mov_b32_e32 v8, v0
-; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_mov_b32_e32 v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
-  %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret double %result
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
 }
 
-define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
-; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:2040
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
+; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:2040
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
+; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v7, v5
-; GFX908-NEXT:    v_mov_b32_e32 v6, v4
-; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
-; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX908-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v0, v4
-; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
-; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7f8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v9, v1
-; GFX8-NEXT:    v_mov_b32_e32 v8, v0
-; GFX8-NEXT:    v_max_f64 v[0:1], v[8:9], v[8:9]
-; GFX8-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
-; GFX7-NEXT:    v_mov_b32_e32 v0, v2
-; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
-; GFX6-NEXT:    v_mov_b32_e32 v0, v2
-; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
-  %result = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret double %result
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret void
 }
 
-define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f64:
+define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
@@ -3482,7 +3860,7 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX12-NEXT:    v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
@@ -3490,34 +3868,34 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -3526,41 +3904,41 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
 ; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX908-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
 ; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
@@ -3568,18 +3946,20 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB23_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
@@ -3592,375 +3972,466 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB23_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    s_mov_b32 s5, -1
 ; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f64:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_movk_i32 s4, 0xf800
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s4, s6
-; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s5, -1
 ; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
+  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:2040
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off offset:2040
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:2040
+; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX90A-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX90A-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:2040 glc
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v5, v3
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7f8, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB24_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB24_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
+; GFX7-NEXT:    v_mov_b32_e32 v5, v1
+; GFX7-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX7-NEXT:    s_mov_b64 s[8:9], 0
+; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v11, v1
+; GFX7-NEXT:    v_mov_b32_e32 v10, v0
+; GFX7-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX7-NEXT:    v_min_f64 v[8:9], v[0:1], v[6:7]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v8
+; GFX7-NEXT:    v_mov_b32_e32 v1, v9
+; GFX7-NEXT:    v_mov_b32_e32 v2, v10
+; GFX7-NEXT:    v_mov_b32_e32 v3, v11
+; GFX7-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    v_mov_b32_e32 v5, v1
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX6-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX6-NEXT:    s_mov_b64 s[8:9], 0
+; GFX6-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v11, v1
+; GFX6-NEXT:    v_mov_b32_e32 v10, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX6-NEXT:    v_min_f64 v[8:9], v[0:1], v[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v0, v8
+; GFX6-NEXT:    v_mov_b32_e32 v1, v9
+; GFX6-NEXT:    v_mov_b32_e32 v2, v10
+; GFX6-NEXT:    v_mov_b32_e32 v3, v11
+; GFX6-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret void
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+  ret double %result
 }
 
-define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX12-NEXT:    v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
+; GFX12-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
+; GFX12-NEXT:    v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
 ; GFX12-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX12-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc1
-; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048
+; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX11-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX11-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB21_1: ; %atomicrmw.start
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX11-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
+; GFX11-NEXT:    global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[2:3], off offset:-2048
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[2:3], off offset:-2048
+; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
-; GFX908-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX908-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX908-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
-; GFX908-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX908-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX908-NEXT:    global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:-2048 glc
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX908-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX908-NEXT:    global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1
-; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v5, v3
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT:    v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX8-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
-; GFX8-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB25_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX8-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
-; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX8-NEXT:    v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v5, v3
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB25_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xf800
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s5, -1
-; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s4, s6
+; GFX7-NEXT:    s_mov_b32 s5, s6
+; GFX7-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_movk_i32 s4, 0xf800
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s5, -1
-; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s4, s6
+; GFX6-NEXT:    s_mov_b32 s5, s6
+; GFX6-NEXT:    buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_wbinvl1
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
-  ret void
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+  ret double %result
 }
 
 ; --------------------------------------------------------------------
 ; half
 ; --------------------------------------------------------------------
 
-define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f16:
+define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -3978,7 +4449,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -4000,13 +4471,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -4019,7 +4490,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -4035,13 +4506,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v0
@@ -4055,7 +4526,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -4077,13 +4548,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -4095,7 +4566,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -4112,13 +4583,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -4131,7 +4602,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -4146,13 +4617,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -4165,7 +4636,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -4180,13 +4651,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -4199,7 +4670,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -4215,13 +4686,13 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -4238,7 +4709,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
 ; GFX7-NEXT:    v_not_b32_e32 v7, v2
-; GFX7-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -4257,14 +4728,14 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f16:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -4281,7 +4752,7 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v5
 ; GFX6-NEXT:    v_not_b32_e32 v7, v2
-; GFX6-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB26_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -4300,19 +4771,19 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB26_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4331,7 +4802,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -4353,13 +4824,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -4374,7 +4845,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -4390,13 +4861,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -4411,7 +4882,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -4433,13 +4904,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -4452,7 +4923,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -4469,13 +4940,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -4489,7 +4960,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -4504,13 +4975,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -4524,7 +4995,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -4539,13 +5010,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -4559,7 +5030,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -4575,13 +5046,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -4599,7 +5070,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX7-NEXT:    v_not_b32_e32 v8, v2
-; GFX7-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -4618,14 +5089,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -4643,7 +5114,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX6-NEXT:    v_not_b32_e32 v8, v2
-; GFX6-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB27_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -4663,7 +5134,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB27_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -4671,12 +5142,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -4695,7 +5166,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -4717,13 +5188,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -4739,7 +5210,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -4755,13 +5226,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -4776,7 +5247,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -4798,13 +5269,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -4817,7 +5288,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -4834,13 +5305,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -4854,7 +5325,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -4869,13 +5340,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -4889,7 +5360,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -4904,13 +5375,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -4924,7 +5395,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -4940,13 +5411,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -4964,7 +5435,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX7-NEXT:    v_not_b32_e32 v8, v2
-; GFX7-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -4983,14 +5454,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -5008,7 +5479,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX6-NEXT:    v_not_b32_e32 v8, v2
-; GFX6-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB28_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -5028,7 +5499,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB28_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -5036,12 +5507,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
-  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
  }
 
-define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f16:
+define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5059,7 +5530,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v6, v3
-; GFX12-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5081,12 +5552,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -5099,7 +5570,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX940-NEXT:    v_not_b32_e32 v6, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
@@ -5115,12 +5586,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v0
@@ -5134,7 +5605,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v6, v3
-; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5156,12 +5627,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -5173,7 +5644,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v6, v3
-; GFX10-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5190,12 +5661,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -5208,7 +5679,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX90A-NEXT:    v_not_b32_e32 v6, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v4, v3, v5
@@ -5223,12 +5694,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -5241,7 +5712,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX908-NEXT:    v_not_b32_e32 v6, v3
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5256,12 +5727,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -5274,7 +5745,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX8-NEXT:    v_not_b32_e32 v6, v3
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -5290,12 +5761,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -5312,7 +5783,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
-; GFX7-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
@@ -5331,12 +5802,12 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f16:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -5353,7 +5824,7 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
-; GFX6-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB29_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v2, v4
@@ -5373,17 +5844,17 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB29_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5402,7 +5873,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5424,12 +5895,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -5444,7 +5915,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_not_b32_e32 v5, v5
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX940-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5460,12 +5931,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -5480,7 +5951,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
-; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5502,12 +5973,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -5520,7 +5991,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5537,12 +6008,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -5556,7 +6027,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_not_b32_e32 v5, v5
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX90A-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5571,12 +6042,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -5590,7 +6061,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_not_b32_e32 v5, v5
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5605,12 +6076,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -5624,7 +6095,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5640,12 +6111,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -5663,7 +6134,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX7-NEXT:    v_not_b32_e32 v6, v2
-; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5682,12 +6153,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -5705,7 +6176,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_not_b32_e32 v6, v2
-; GFX6-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB30_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5725,18 +6196,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB30_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -5755,7 +6226,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5777,12 +6248,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -5798,7 +6269,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    v_not_b32_e32 v5, v5
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX940-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5814,12 +6285,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -5834,7 +6305,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
-; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5856,12 +6327,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -5874,7 +6345,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5891,12 +6362,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -5910,7 +6381,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    v_not_b32_e32 v5, v5
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5925,12 +6396,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -5944,7 +6415,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    v_not_b32_e32 v5, v5
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5959,12 +6430,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
@@ -5978,7 +6449,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -5994,12 +6465,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -6017,7 +6488,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX7-NEXT:    v_not_b32_e32 v6, v2
-; GFX7-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -6036,12 +6507,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -6059,7 +6530,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_not_b32_e32 v6, v2
-; GFX6-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -6079,18 +6550,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6100,7 +6571,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -6119,20 +6590,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
-; GFX940-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -6146,19 +6617,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -6177,19 +6648,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -6205,20 +6676,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX90A-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -6231,20 +6702,20 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX908-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -6257,13 +6728,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -6271,7 +6742,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v1, v2, v2
-; GFX8-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -6285,12 +6756,12 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -6301,7 +6772,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v2
-; GFX7-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6318,13 +6789,13 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -6335,7 +6806,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v2
-; GFX6-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB32_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6353,19 +6824,19 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB32_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4
+  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6375,7 +6846,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_max_num_f16_e32 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_max_num_f16_e32 v2, v3, v3
@@ -6394,19 +6865,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX940-NEXT:    s_mov_b32 s2, 0xffff0000
-; GFX940-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6420,18 +6891,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6450,18 +6921,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6477,19 +6948,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX90A-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX90A-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6502,19 +6973,19 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v4, v2, v2
 ; GFX908-NEXT:    s_mov_b32 s6, 0xffff0000
-; GFX908-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6527,12 +6998,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
@@ -6540,7 +7011,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v4, v2, v2
-; GFX8-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_e32 v2, v3, v3
@@ -6554,12 +7025,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -6570,7 +7041,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX7-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6587,12 +7058,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -6603,7 +7074,7 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX6-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB33_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
@@ -6621,18 +7092,18 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB33_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -6651,7 +7122,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -6673,13 +7144,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -6694,7 +7165,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX940-NEXT:    v_not_b32_e32 v4, v4
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX940-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -6710,13 +7181,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -6731,7 +7202,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
-; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -6753,13 +7224,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -6772,7 +7243,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -6789,13 +7260,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -6809,7 +7280,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_not_b32_e32 v4, v4
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX90A-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -6826,13 +7297,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -6846,7 +7317,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX908-NEXT:    v_not_b32_e32 v4, v4
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX908-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -6861,13 +7332,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -6881,7 +7352,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -6897,13 +7368,13 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -6921,7 +7392,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX7-NEXT:    v_not_b32_e32 v8, v2
-; GFX7-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -6940,14 +7411,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -6965,7 +7436,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v7, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v6
 ; GFX6-NEXT:    v_not_b32_e32 v8, v2
-; GFX6-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -6985,7 +7456,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -6993,12 +7464,12 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
 
-define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -7017,7 +7488,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7039,12 +7510,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -7059,7 +7530,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_not_b32_e32 v5, v5
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX940-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7075,12 +7546,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -7095,7 +7566,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
-; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7117,12 +7588,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -7135,7 +7606,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7152,12 +7623,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -7171,7 +7642,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_not_b32_e32 v5, v5
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX90A-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7188,12 +7659,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -7207,7 +7678,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_not_b32_e32 v5, v5
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX908-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7222,12 +7693,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -7241,7 +7712,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_e32 v6, v2, v2
-; GFX8-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7257,12 +7728,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -7280,7 +7751,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX7-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX7-NEXT:    v_not_b32_e32 v6, v2
-; GFX7-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7299,12 +7770,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -7322,7 +7793,7 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_not_b32_e32 v6, v2
-; GFX6-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB35_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -7342,13 +7813,13 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB35_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -7356,8 +7827,8 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
 ; bfloat
 ; --------------------------------------------------------------------
 
-define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16:
+define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -7374,7 +7845,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -7403,13 +7874,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -7423,7 +7894,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -7445,13 +7916,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
@@ -7465,7 +7936,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -7494,13 +7965,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -7512,7 +7983,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -7533,13 +8004,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -7553,7 +8024,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -7572,13 +8043,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -7592,7 +8063,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -7611,13 +8082,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -7630,7 +8101,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -7651,13 +8122,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -7674,7 +8145,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX7-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -7694,14 +8165,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -7718,7 +8189,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; GFX6-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB36_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v5, v4
@@ -7738,19 +8209,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB36_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -7769,7 +8240,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -7798,13 +8269,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -7820,7 +8291,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -7842,13 +8313,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -7864,7 +8335,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -7893,13 +8364,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -7912,7 +8383,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -7933,13 +8404,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -7954,7 +8425,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -7973,13 +8444,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -7994,7 +8465,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -8013,13 +8484,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -8033,7 +8504,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -8054,13 +8525,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -8078,7 +8549,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX7-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8098,14 +8569,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -8123,7 +8594,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX6-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB37_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8144,7 +8615,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB37_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -8152,12 +8623,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -8176,7 +8647,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -8205,13 +8676,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -8228,7 +8699,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -8250,13 +8721,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -8272,7 +8743,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -8301,13 +8772,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -8320,7 +8791,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -8341,13 +8812,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -8362,7 +8833,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -8381,13 +8852,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -8402,7 +8873,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -8421,13 +8892,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -8441,7 +8912,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -8462,13 +8933,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -8486,7 +8957,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX7-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8506,14 +8977,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -8531,7 +9002,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX6-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB38_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -8552,7 +9023,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB38_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -8560,12 +9031,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
-  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
  }
 
-define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16:
+define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -8582,7 +9053,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v6, v3
-; GFX12-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8610,12 +9081,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v0
@@ -8629,7 +9100,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8651,12 +9122,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
@@ -8670,7 +9141,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v6, v3
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8698,12 +9169,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v0
@@ -8715,7 +9186,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v5, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v6, v3
-; GFX10-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8736,12 +9207,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
@@ -8755,7 +9226,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8774,12 +9245,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v0
@@ -8793,7 +9264,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8812,12 +9283,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v0
@@ -8830,7 +9301,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    v_not_b32_e32 v6, v3
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -8851,12 +9322,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v0
@@ -8873,7 +9344,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    v_not_b32_e32 v6, v3
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8893,12 +9364,12 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v0
@@ -8915,7 +9386,7 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX6-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB39_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
@@ -8936,17 +9407,17 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB39_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -8965,7 +9436,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -8993,12 +9464,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -9014,7 +9485,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9036,12 +9507,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -9057,7 +9528,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9085,12 +9556,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -9103,7 +9574,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9124,12 +9595,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -9144,7 +9615,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9163,12 +9634,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -9183,7 +9654,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9202,12 +9673,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -9221,7 +9692,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9242,12 +9713,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -9265,7 +9736,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9285,12 +9756,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -9308,7 +9779,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB36_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB40_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9329,18 +9800,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB36_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -9359,7 +9830,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9387,12 +9858,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_movk_i32 s0, 0xf800
@@ -9409,7 +9880,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9431,12 +9902,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -9452,7 +9923,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9480,12 +9951,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
@@ -9498,7 +9969,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9519,12 +9990,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -9539,7 +10010,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9558,12 +10029,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -9578,7 +10049,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9597,12 +10068,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xfffff800, v0
@@ -9616,7 +10087,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -9637,12 +10108,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -9660,7 +10131,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9680,12 +10151,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
@@ -9703,7 +10174,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB37_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB41_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -9724,18 +10195,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB37_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -9745,7 +10216,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -9771,13 +10242,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -9785,7 +10256,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
-; GFX940-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -9806,20 +10277,20 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -9845,19 +10316,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -9878,13 +10349,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -9892,7 +10363,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX90A-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -9911,13 +10382,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -9925,7 +10396,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX908-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -9944,13 +10415,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -9958,7 +10429,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX8-NEXT:    flat_load_dword v0, v[3:4]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -9978,12 +10449,12 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -9994,7 +10465,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10012,13 +10483,13 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -10029,7 +10500,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB38_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB42_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10048,19 +10519,19 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB38_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4
+  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -10070,7 +10541,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10095,12 +10566,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -10108,7 +10579,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
 ; GFX940-NEXT:    s_mov_b32 s3, 0xffff0000
-; GFX940-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10129,19 +10600,19 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2046
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10166,18 +10637,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2046
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10198,12 +10669,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -10211,7 +10682,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX90A-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX90A-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10230,12 +10701,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2046
@@ -10243,7 +10714,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
 ; GFX908-NEXT:    s_mov_b32 s7, 0xffff0000
-; GFX908-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10262,12 +10733,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fe, v0
@@ -10275,7 +10746,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10295,12 +10766,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -10311,7 +10782,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX7-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10329,12 +10800,12 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -10345,7 +10816,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; GFX6-NEXT:  .LBB39_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB43_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -10364,18 +10835,18 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB39_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -10394,7 +10865,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v4, v4
-; GFX12-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v5
@@ -10423,13 +10894,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -10445,7 +10916,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v5
@@ -10467,13 +10938,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -10489,7 +10960,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v5
@@ -10518,13 +10989,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -10537,7 +11008,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
-; GFX10-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v5
@@ -10558,13 +11029,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -10579,7 +11050,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
@@ -10600,13 +11071,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -10621,7 +11092,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v5
@@ -10640,13 +11111,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fe, v0
@@ -10660,7 +11131,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v5
@@ -10681,13 +11152,13 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -10705,7 +11176,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_not_b32_e32 v7, v4
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX7-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -10725,14 +11196,14 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -10750,7 +11221,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_not_b32_e32 v7, v4
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
-; GFX6-NEXT:  .LBB40_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB44_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
@@ -10771,7 +11242,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v4
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB40_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v6, v4
@@ -10779,12 +11250,12 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
 
-define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -10803,7 +11274,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_not_b32_e32 v5, v5
-; GFX12-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -10831,12 +11302,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0x7fe
@@ -10852,7 +11323,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX940-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -10874,12 +11345,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -10895,7 +11366,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v5
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -10923,12 +11394,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, 0x7fe, v0
@@ -10941,7 +11412,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v4
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, 0xffff
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
-; GFX10-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -10962,12 +11433,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -10982,7 +11453,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -11003,12 +11474,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, 0x7fe, v0
@@ -11023,7 +11494,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX908-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -11042,12 +11513,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fe, v0
@@ -11061,7 +11532,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    v_not_b32_e32 v5, v5
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -11082,12 +11553,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -11105,7 +11576,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_not_b32_e32 v5, v5
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -11125,12 +11596,12 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 0x7fe, v0
@@ -11148,7 +11619,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_not_b32_e32 v5, v5
 ; GFX6-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX6-NEXT:  .LBB41_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB45_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v3
@@ -11169,13 +11640,13 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB41_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -11183,8 +11654,8 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
 ; <2 x half>
 ; --------------------------------------------------------------------
 
-define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16:
+define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -11194,7 +11665,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -11210,19 +11681,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -11236,19 +11707,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -11264,19 +11735,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -11290,19 +11761,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -11314,19 +11785,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -11338,20 +11809,20 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v3
@@ -11366,13 +11837,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -11389,7 +11860,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX7-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11414,14 +11885,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -11438,7 +11909,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX6-NEXT:  .LBB42_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB46_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11464,19 +11935,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB42_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -11486,7 +11957,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -11502,19 +11973,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -11528,19 +11999,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -11556,19 +12027,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -11582,19 +12053,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -11606,19 +12077,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -11630,13 +12101,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -11645,7 +12116,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -11660,12 +12131,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -11682,7 +12153,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX7-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11707,14 +12178,14 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -11731,7 +12202,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX6-NEXT:  .LBB43_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB47_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -11757,7 +12228,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB43_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
@@ -11765,12 +12236,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -11780,7 +12251,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -11796,19 +12267,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -11822,19 +12293,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -11850,19 +12321,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -11876,19 +12347,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -11900,19 +12371,19 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -11924,13 +12395,13 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -11939,7 +12410,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -11954,12 +12425,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -11980,7 +12451,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -12005,12 +12476,12 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -12031,7 +12502,7 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v6
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:  .LBB44_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB48_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -12057,18 +12528,18 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB44_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB48_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16:
+define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12078,7 +12549,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -12094,18 +12565,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12119,18 +12590,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12146,18 +12617,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12171,18 +12642,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12194,18 +12665,18 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12217,19 +12688,19 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -12244,12 +12715,12 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -12266,7 +12737,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12291,12 +12762,12 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -12313,7 +12784,7 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB45_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB49_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12339,17 +12810,17 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB45_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12359,7 +12830,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -12375,18 +12846,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12400,18 +12871,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12427,18 +12898,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12452,18 +12923,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12475,18 +12946,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12498,12 +12969,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -12512,7 +12983,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -12527,12 +12998,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -12549,7 +13020,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12574,12 +13045,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -12596,7 +13067,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB46_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB50_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12622,18 +13093,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB46_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB50_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12643,7 +13114,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -12659,18 +13130,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12684,18 +13155,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12711,18 +13182,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12736,18 +13207,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12759,18 +13230,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -12782,12 +13253,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -12796,7 +13267,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -12811,12 +13282,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -12837,7 +13308,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12862,12 +13333,12 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -12888,7 +13359,7 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB47_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB51_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -12914,18 +13385,18 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB47_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB51_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -12935,7 +13406,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v4, v3
@@ -12951,19 +13422,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX940-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v3
@@ -12977,19 +13448,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v3
@@ -13005,19 +13476,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -13031,19 +13502,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX90A-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
@@ -13057,19 +13528,19 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX908-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v4, v3
@@ -13081,13 +13552,13 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -13096,7 +13567,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX8-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -13111,12 +13582,12 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -13133,7 +13604,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX7-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -13158,14 +13629,14 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -13182,7 +13653,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v6
-; GFX6-NEXT:  .LBB48_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB52_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -13208,7 +13679,7 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB48_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB52_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v2
@@ -13216,12 +13687,12 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
 
-define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -13231,7 +13702,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX12-NEXT:    v_pk_max_num_f16 v4, v2, v2
 ; GFX12-NEXT:    s_mov_b32 s0, 0
-; GFX12-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_pk_max_num_f16 v2, v3, v3
@@ -13247,18 +13718,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX940-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX940-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX940-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13272,18 +13743,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
 ; GFX11-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13299,18 +13770,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_pk_max_f16 v4, v2, v2
 ; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13324,18 +13795,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX90A-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13349,18 +13820,18 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:    v_pk_max_f16 v4, v2, v2
-; GFX908-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_pk_max_f16 v2, v3, v3
@@ -13372,12 +13843,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -13386,7 +13857,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8-NEXT:    v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e32 v5, v2, v2
-; GFX8-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -13401,12 +13872,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -13423,7 +13894,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -13448,12 +13919,12 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -13470,7 +13941,7 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v5
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT:  .LBB49_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB53_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -13496,13 +13967,13 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB49_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -13510,8 +13981,8 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
 ; <2 x bfloat>
 ; --------------------------------------------------------------------
 
-define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -13522,7 +13993,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -13555,13 +14026,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -13570,7 +14041,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -13597,13 +14068,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
@@ -13612,7 +14083,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -13645,21 +14116,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -13686,13 +14157,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -13701,7 +14172,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -13726,13 +14197,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -13741,7 +14212,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -13766,20 +14237,20 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v3
@@ -13807,13 +14278,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -13829,7 +14300,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -13851,14 +14322,14 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -13874,7 +14345,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT:  .LBB50_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB54_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -13897,19 +14368,19 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB50_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB54_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -13920,7 +14391,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -13953,13 +14424,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -13968,7 +14439,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -13995,13 +14466,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -14010,7 +14481,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -14043,21 +14514,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -14084,13 +14555,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -14099,7 +14570,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -14124,13 +14595,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -14139,7 +14610,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -14164,13 +14635,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -14179,7 +14650,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -14207,12 +14678,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -14228,7 +14699,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -14250,14 +14721,14 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -14273,7 +14744,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT:  .LBB51_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB55_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -14296,7 +14767,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB51_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
@@ -14304,12 +14775,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -14320,7 +14791,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -14353,13 +14824,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -14368,7 +14839,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -14395,13 +14866,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
@@ -14410,7 +14881,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -14443,21 +14914,21 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -14484,13 +14955,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -14499,7 +14970,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -14524,13 +14995,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -14539,7 +15010,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -14564,13 +15035,13 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -14579,7 +15050,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -14607,12 +15078,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -14632,7 +15103,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX7-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -14654,12 +15125,12 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -14679,7 +15150,7 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX6-NEXT:  .LBB52_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB56_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -14702,18 +15173,18 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB52_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512
-  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -14724,7 +15195,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14756,12 +15227,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off
@@ -14770,7 +15241,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14797,12 +15268,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
@@ -14811,7 +15282,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14843,20 +15314,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14883,12 +15354,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -14897,7 +15368,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14922,12 +15393,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -14936,7 +15407,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -14961,19 +15432,19 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15001,12 +15472,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -15022,7 +15493,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15044,12 +15515,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -15065,7 +15536,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB53_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB57_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15088,17 +15559,17 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB53_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB57_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -15109,7 +15580,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15141,12 +15612,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15155,7 +15626,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15182,12 +15653,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -15196,7 +15667,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15228,20 +15699,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15268,12 +15739,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15282,7 +15753,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15307,12 +15778,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15321,7 +15792,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15346,12 +15817,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -15360,7 +15831,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15388,12 +15859,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -15409,7 +15880,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15431,12 +15902,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -15452,7 +15923,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB54_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB58_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15475,18 +15946,18 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB54_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -15497,7 +15968,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15529,12 +16000,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -15543,7 +16014,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15570,12 +16041,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-2048
@@ -15584,7 +16055,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15616,20 +16087,20 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15656,12 +16127,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -15670,7 +16141,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15695,12 +16166,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-2048
@@ -15709,7 +16180,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15734,12 +16205,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -15748,7 +16219,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -15776,12 +16247,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0xf800
@@ -15801,7 +16272,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15823,12 +16294,12 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0xf800
@@ -15848,7 +16319,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB55_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB59_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -15871,18 +16342,18 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB55_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB59_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -15893,7 +16364,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v6, v3
@@ -15926,13 +16397,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -15941,7 +16412,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v7, v3
@@ -15968,13 +16439,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -15983,7 +16454,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v3
@@ -16016,21 +16487,21 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v3
@@ -16057,13 +16528,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16072,7 +16543,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
@@ -16099,13 +16570,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16114,7 +16585,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v6, v3
@@ -16139,13 +16610,13 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -16154,7 +16625,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v0
@@ -16182,12 +16653,12 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -16203,7 +16674,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -16225,14 +16696,14 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -16248,7 +16719,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT:  .LBB56_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB60_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -16271,7 +16742,7 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB56_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB60_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
@@ -16279,12 +16750,12 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
+  %result = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
 
-define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    s_wait_expcnt 0x0
@@ -16295,7 +16766,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX12-NEXT:    s_mov_b32 s1, 0
-; GFX12-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX12-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16327,12 +16798,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX12-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16341,7 +16812,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
-; GFX940-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX940-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16368,12 +16839,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX940-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:2044
@@ -16382,7 +16853,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16414,20 +16885,20 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:2044
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16454,12 +16925,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16468,7 +16939,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16495,12 +16966,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:2044
@@ -16509,7 +16980,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16534,12 +17005,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -16548,7 +17019,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX8-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -16576,12 +17047,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX8-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -16597,7 +17068,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX7-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -16619,12 +17090,12 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX7-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX7-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -16640,7 +17111,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:  .LBB57_1: ; %atomicrmw.start
+; GFX6-NEXT:  .LBB61_1: ; %atomicrmw.start
 ; GFX6-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 1.0, v4
@@ -16663,15 +17134,17 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
 ; GFX6-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_execnz .LBB57_1
+; GFX6-NEXT:    s_cbranch_execnz .LBB61_1
 ; GFX6-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX6-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
-  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
+  %unused = atomicrmw fmin ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 46f0bb0393885..a841038828963 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS-ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
@@ -145,39 +145,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB0_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB0_2
-; GFX7LESS-DPP-NEXT:  .LBB0_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -278,81 +245,81 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
 }
 
 define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
-; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_mov_b32 s39, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s36, s36, s9
-; GFX7LESS-NEXT:    s_addc_u32 s37, s37, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s6
-; GFX7LESS-NEXT:    s_mov_b32 s13, s7
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX7LESS-NEXT:  .LBB1_1: ; %ComputeLoop
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7LESS-NEXT:    v_readlane_b32 s4, v0, s2
-; GFX7LESS-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX7LESS-NEXT:    v_mul_f32_e64 v2, 1.0, s4
-; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
-; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_max_f32_e32 v2, v1, v2
-; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB1_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_5
-; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7LESS-NEXT:  .LBB1_4: ; %atomicrmw.start
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB1_4
-; GFX7LESS-NEXT:  .LBB1_5:
-; GFX7LESS-NEXT:    s_endpgm
+; GFX7LESS-ITERATIVE-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS-ITERATIVE:       ; %bb.0:
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s39, 0xe8f000
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s36, s36, s9
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s37, s37, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s8, s34, 44
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s9, s35, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7LESS-ITERATIVE-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX7LESS-ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e64 v2, 1.0, s4
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_vccnz .LBB1_1
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execz .LBB1_5
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.3:
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-ITERATIVE-NEXT:  .LBB1_4: ; %atomicrmw.start
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-ITERATIVE-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-ITERATIVE-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execnz .LBB1_4
+; GFX7LESS-ITERATIVE-NEXT:  .LBB1_5:
+; GFX7LESS-ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
@@ -1148,55 +1115,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1064-NEXT:  .LBB2_3:
+; GFX1064-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB2_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1032-NEXT:  .LBB2_3:
+; GFX1032-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB2_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
@@ -1206,95 +1147,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1164-NEXT:  .LBB2_3:
+; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB2_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1132-NEXT:  .LBB2_3:
+; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB2_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB2_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX7LESS-DPP-NEXT:  .LBB2_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1330,55 +1211,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1064-DPP-NEXT:  .LBB2_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB2_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1032-DPP-NEXT:  .LBB2_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB2_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
@@ -1388,60 +1243,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1164-DPP-NEXT:  .LBB2_3:
+; GFX1164-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB2_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1132-DPP-NEXT:  .LBB2_3:
+; GFX1132-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB2_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
   ret void
@@ -1449,81 +1277,81 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 
 
 define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
-; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_mov_b32 s39, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s36, s36, s9
-; GFX7LESS-NEXT:    s_addc_u32 s37, s37, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s6
-; GFX7LESS-NEXT:    s_mov_b32 s13, s7
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX7LESS-NEXT:  .LBB3_1: ; %ComputeLoop
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7LESS-NEXT:    v_readlane_b32 s4, v0, s2
-; GFX7LESS-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX7LESS-NEXT:    v_mul_f32_e64 v2, 1.0, s4
-; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
-; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_max_f32_e32 v2, v1, v2
-; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB3_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX7LESS-NEXT:    s_cbranch_execz .LBB3_5
-; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7LESS-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX7LESS-NEXT:  .LBB3_5:
-; GFX7LESS-NEXT:    s_endpgm
+; GFX7LESS-ITERATIVE-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX7LESS-ITERATIVE:       ; %bb.0:
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s39, 0xe8f000
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s36, s36, s9
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s37, s37, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s8, s34, 44
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s9, s35, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7LESS-ITERATIVE-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX7LESS-ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e64 v2, 1.0, s4
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_vccnz .LBB3_1
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execz .LBB3_5
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.3:
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-ITERATIVE-NEXT:  .LBB3_4: ; %atomicrmw.start
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-ITERATIVE-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-ITERATIVE-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX7LESS-ITERATIVE-NEXT:  .LBB3_5:
+; GFX7LESS-ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
 ; GFX9:       ; %bb.0:
@@ -1623,18 +1451,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1642,27 +1470,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1064-NEXT:  .LBB3_5:
+; GFX1064-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB3_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -1693,45 +1507,31 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1032-NEXT:  .LBB3_5:
+; GFX1032-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB3_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -1752,13 +1552,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
@@ -1766,7 +1566,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1775,29 +1575,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1164-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1164-NEXT:  .LBB3_5:
+; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB3_4:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -1818,13 +1602,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1132-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1832,36 +1616,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1132-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1132-NEXT:  .LBB3_5:
+; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB3_4:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -2080,27 +1849,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1064-DPP-NEXT:  .LBB3_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax v1, v0, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB3_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -2162,29 +1917,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1032-DPP-NEXT:  .LBB3_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax v1, v0, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB3_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -2252,34 +1993,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1164-DPP-NEXT:  .LBB3_3:
+; GFX1164-DPP-NEXT:    global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB3_2:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
@@ -2335,34 +2060,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1132-DPP-NEXT:  .LBB3_3:
+; GFX1132-DPP-NEXT:    global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB3_2:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
@@ -2439,55 +2148,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1064-NEXT:  .LBB4_3:
+; GFX1064-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB4_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1032-NEXT:  .LBB4_3:
+; GFX1032-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB4_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
@@ -2497,95 +2180,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1164-NEXT:  .LBB4_3:
+; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB4_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1132-NEXT:  .LBB4_3:
+; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB4_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB4_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX7LESS-DPP-NEXT:  .LBB4_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2621,55 +2244,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1064-DPP-NEXT:  .LBB4_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB4_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1032-DPP-NEXT:  .LBB4_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB4_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
@@ -2679,141 +2276,114 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1164-DPP-NEXT:  .LBB4_3:
+; GFX1164-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB4_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1132-DPP-NEXT:  .LBB4_3:
+; GFX1132-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB4_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
   ret void
 }
 
 define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
-; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_mov_b32 s39, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s36, s36, s9
-; GFX7LESS-NEXT:    s_addc_u32 s37, s37, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s6
-; GFX7LESS-NEXT:    s_mov_b32 s13, s7
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX7LESS-NEXT:  .LBB5_1: ; %ComputeLoop
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7LESS-NEXT:    v_readlane_b32 s4, v0, s2
-; GFX7LESS-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX7LESS-NEXT:    v_mul_f32_e64 v2, 1.0, s4
-; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
-; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_max_f32_e32 v2, v1, v2
-; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB5_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX7LESS-NEXT:    s_cbranch_execz .LBB5_5
-; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7LESS-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX7LESS-NEXT:  .LBB5_5:
-; GFX7LESS-NEXT:    s_endpgm
+; GFX7LESS-ITERATIVE-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX7LESS-ITERATIVE:       ; %bb.0:
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s39, 0xe8f000
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s36, s36, s9
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s37, s37, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s8, s34, 44
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s9, s35, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7LESS-ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX7LESS-ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e64 v2, 1.0, s4
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_vccnz .LBB5_1
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execz .LBB5_5
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.3:
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-ITERATIVE-NEXT:  .LBB5_4: ; %atomicrmw.start
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-ITERATIVE-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-ITERATIVE-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX7LESS-ITERATIVE-NEXT:  .LBB5_5:
+; GFX7LESS-ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
 ; GFX9:       ; %bb.0:
@@ -2913,18 +2483,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2932,27 +2502,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1064-NEXT:  .LBB5_5:
+; GFX1064-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB5_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -2983,45 +2539,31 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1032-NEXT:  .LBB5_5:
+; GFX1032-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB5_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -3042,13 +2584,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
@@ -3056,7 +2598,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3065,29 +2607,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1164-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1164-NEXT:  .LBB5_5:
+; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB5_4:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -3108,13 +2634,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1132-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3122,36 +2648,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1132-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1132-NEXT:  .LBB5_5:
+; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB5_4:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -3370,27 +2881,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1064-DPP-NEXT:  .LBB5_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax v1, v0, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB5_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -3452,29 +2949,15 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1032-DPP-NEXT:  .LBB5_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax v1, v0, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB5_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -3542,34 +3025,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1164-DPP-NEXT:  .LBB5_3:
+; GFX1164-DPP-NEXT:    global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB5_2:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
@@ -3625,34 +3092,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1132-DPP-NEXT:  .LBB5_3:
+; GFX1132-DPP-NEXT:    global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB5_2:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue monotonic, align 4
@@ -3662,1027 +3113,426 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
 ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX7LESS-NEXT:  ; %bb.1:
-; GFX7LESS-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x9
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[44:45], 0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v3, v0, v1
+; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7LESS-NEXT:    v_or_b32_e32 v40, v3, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], 4.0
-; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s42
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s43
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v5, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v4
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v5
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX7LESS-NEXT:  .LBB6_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX9-NEXT:    s_mov_b32 s33, s8
-; GFX9-NEXT:    s_mov_b32 s40, s7
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX9-NEXT:    s_mov_b32 s41, s6
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[44:45], 0
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_or3_b32 v40, v0, v4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s41
-; GFX9-NEXT:    s_mov_b32 s13, s40
-; GFX9-NEXT:    s_mov_b32 s14, s33
-; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s42
-; GFX9-NEXT:    v_mov_b32_e32 v3, s43
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX9-NEXT:  .LBB6_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-NEXT:    s_add_u32 s48, s48, s9
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT:    s_movk_i32 s32, 0x800
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1064-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s33, s8
-; GFX1064-NEXT:    s_mov_b32 s40, s7
-; GFX1064-NEXT:    s_mov_b32 s41, s6
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1064-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1064-NEXT:  .LBB6_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-NEXT:    s_add_u32 s8, s36, 44
-; GFX1064-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1064-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-NEXT:    s_mov_b32 s12, s41
-; GFX1064-NEXT:    s_mov_b32 s13, s40
-; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB6_2
-; GFX1064-NEXT:  .LBB6_3:
+; GFX1064-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:  .LBB6_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-NEXT:    s_add_u32 s48, s48, s9
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT:    s_mov_b32 s44, 0
-; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1032-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s33, s8
-; GFX1032-NEXT:    s_mov_b32 s40, s7
-; GFX1032-NEXT:    s_mov_b32 s41, s6
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1032-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1032-NEXT:  .LBB6_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-NEXT:    s_add_u32 s8, s36, 44
-; GFX1032-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1032-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s12, s41
-; GFX1032-NEXT:    s_mov_b32 s13, s40
-; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-NEXT:    s_cbranch_execnz .LBB6_2
-; GFX1032-NEXT:  .LBB6_3:
+; GFX1032-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT:  .LBB6_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1164-NEXT:    s_mov_b32 s33, s8
-; GFX1164-NEXT:    s_mov_b32 s40, s7
-; GFX1164-NEXT:    s_mov_b32 s41, s6
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT:    .p2align 6
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1164-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-NEXT:    s_add_u32 s8, s36, 44
-; GFX1164-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1164-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-NEXT:    s_mov_b32 s12, s41
-; GFX1164-NEXT:    s_mov_b32 s13, s40
-; GFX1164-NEXT:    s_mov_b32 s14, s33
-; GFX1164-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1164-NEXT:  .LBB6_3:
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1132-NEXT:    s_mov_b32 s44, 0
-; GFX1132-NEXT:    s_mov_b32 s32, 32
+; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    s_mov_b32 s40, s14
-; GFX1132-NEXT:    s_mov_b32 s41, s13
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-NEXT:    s_mov_b64 s[36:37], s[2:3]
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT:    .p2align 6
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX1132-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-NEXT:    s_add_u32 s8, s36, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1132-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s12, s41
-; GFX1132-NEXT:    s_mov_b32 s13, s40
-; GFX1132-NEXT:    s_mov_b32 s14, s33
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1132-NEXT:  .LBB6_3:
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB6_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[44:45], 0
-; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v3, v0, v1
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v40, v3, v2
-; GFX7LESS-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], 4.0
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB6_2
-; GFX7LESS-DPP-NEXT:  .LBB6_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX9-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX9-DPP-NEXT:    s_mov_b32 s40, s7
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_2
+; GFX9-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX9-DPP-NEXT:  .LBB6_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX1064-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1064-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB6_2
-; GFX1064-DPP-NEXT:  .LBB6_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB6_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
-; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX1032-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1032-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB6_2
-; GFX1032-DPP-NEXT:  .LBB6_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB6_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX1164-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT:    .p2align 6
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1164-DPP-NEXT:  .LBB6_3:
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT:    .p2align 6
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX1132-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1132-DPP-NEXT:  .LBB6_3:
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-DPP-NEXT:    s_endpgm
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8
   ret void
 }
 
 define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
-; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
-; GFX7LESS-NEXT:  .LBB7_1: ; %ComputeLoop
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7LESS-NEXT:    v_readlane_b32 s3, v1, s4
-; GFX7LESS-NEXT:    v_readlane_b32 s2, v0, s4
-; GFX7LESS-NEXT:    s_lshl_b64 s[4:5], 1, s4
-; GFX7LESS-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
-; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
-; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB7_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_5
-; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s46, -1
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[42:43], 0
-; GFX7LESS-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX7LESS-NEXT:  .LBB7_4: ; %atomicrmw.start
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[41:42]
-; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[42:43]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_4
-; GFX7LESS-NEXT:  .LBB7_5:
-; GFX7LESS-NEXT:    s_endpgm
+; GFX7LESS-ITERATIVE-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS-ITERATIVE:       ; %bb.0:
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s39, 0xe8f000
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s36, s36, s9
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s37, s37, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s8, s34, 44
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s9, s35, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX7LESS-ITERATIVE-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX7LESS-ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_vccnz .LBB7_1
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execz .LBB7_5
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.3:
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:  .LBB7_4: ; %atomicrmw.start
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-ITERATIVE-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v3, v7
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX7LESS-ITERATIVE-NEXT:  .LBB7_5:
+; GFX7LESS-ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT:    s_mov_b32 s33, s8
-; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s40, s7
-; GFX9-NEXT:    s_mov_b32 s41, s6
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s41
-; GFX9-NEXT:    s_mov_b32 s13, s40
-; GFX9-NEXT:    s_mov_b32 s14, s33
-; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX9-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s4, s[0:1]
 ; GFX9-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX9-NEXT:    v_readlane_b32 s2, v0, s4
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX9-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX9-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX9-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4692,103 +3542,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_load_dwordx2 s[42:43], s[36:37], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX9-NEXT:    s_mov_b64 s[44:45], 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v0, s[42:43]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
 ; GFX9-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s41
-; GFX9-NEXT:    s_mov_b32 s13, s40
-; GFX9-NEXT:    s_mov_b32 s14, s33
-; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s42
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s43
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX9-NEXT:  .LBB7_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s9
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b32 s33, s8
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_mov_b32 s40, s7
-; GFX1064-NEXT:    s_mov_b32 s41, s6
-; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT:    s_mov_b32 s12, s41
-; GFX1064-NEXT:    s_mov_b32 s13, s40
-; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    s_movk_i32 s32, 0x800
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT:    v_max_f64 v[2:3], v[3:4], v[3:4]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1064-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1064-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[2:3], v[4:5]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4796,92 +3613,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB7_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX1064-NEXT:  ; %bb.3:
-; GFX1064-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1064-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[42:43]
-; GFX1064-NEXT:  .LBB7_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s41
-; GFX1064-NEXT:    s_mov_b32 s13, s40
-; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB7_4
-; GFX1064-NEXT:  .LBB7_5:
+; GFX1064-NEXT:    global_atomic_fmax_x2 v0, v[2:3], s[0:1]
+; GFX1064-NEXT:  .LBB7_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s9
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b32 s33, s8
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_mov_b32 s40, s7
-; GFX1032-NEXT:    s_mov_b32 s41, s6
-; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT:    s_mov_b32 s12, s41
-; GFX1032-NEXT:    s_mov_b32 s13, s40
-; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
@@ -4899,93 +3667,44 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB7_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX1032-NEXT:  ; %bb.3:
-; GFX1032-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[4:5], v0, s[42:43]
-; GFX1032-NEXT:  .LBB7_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0
-; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s41
-; GFX1032-NEXT:    s_mov_b32 s13, s40
-; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-NEXT:    s_cbranch_execnz .LBB7_4
-; GFX1032-NEXT:  .LBB7_5:
+; GFX1032-NEXT:    global_atomic_fmax_x2 v0, v[2:3], s[0:1]
+; GFX1032-NEXT:  .LBB7_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1164-NEXT:    s_mov_b32 s33, s8
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-NEXT:    s_mov_b32 s12, s6
 ; GFX1164-NEXT:    s_mov_b32 s13, s7
-; GFX1164-NEXT:    s_mov_b32 s14, s33
-; GFX1164-NEXT:    s_mov_b32 s32, 32
-; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
-; GFX1164-NEXT:    s_mov_b32 s40, s7
-; GFX1164-NEXT:    s_mov_b32 s41, s6
+; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s4, s[0:1]
-; GFX1164-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX1164-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1164-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
@@ -4994,7 +3713,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5005,87 +3724,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX1164-NEXT:  ; %bb.3:
-; GFX1164-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1164-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b64 v[4:5], v0, s[42:43]
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT:    .p2align 6
+; GFX1164-NEXT:    global_load_b64 v[2:3], v6, s[0:1]
 ; GFX1164-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s41
-; GFX1164-NEXT:    s_mov_b32 s13, s40
-; GFX1164-NEXT:    s_mov_b32 s14, s33
-; GFX1164-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[4:5], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[4:5], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX1164-NEXT:  .LBB7_5:
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1132-NEXT:    s_mov_b64 s[38:39], s[0:1]
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1132-NEXT:    s_mov_b32 s40, s14
-; GFX1132-NEXT:    s_mov_b32 s41, s13
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
-; GFX1132-NEXT:    s_mov_b32 s32, 32
-; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
@@ -5094,837 +3782,592 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s44, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX1132-NEXT:  ; %bb.3:
-; GFX1132-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b64 v[4:5], v0, s[42:43]
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT:    .p2align 6
+; GFX1132-NEXT:    global_load_b64 v[2:3], v6, s[0:1]
 ; GFX1132-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX1132-NEXT:    s_add_u32 s8, s34, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT:    s_mov_b32 s12, s41
-; GFX1132-NEXT:    s_mov_b32 s13, s40
-; GFX1132-NEXT:    s_mov_b32 s14, s33
-; GFX1132-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1132-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[4:5], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off offset:8
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[4:5], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX1132-NEXT:  .LBB7_5:
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s46, -1
-; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[44:47], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[42:43], 0
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[42:43]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7LESS-DPP-NEXT:    s_endpgm
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX9-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_readlane_b32 s43, v9, 63
-; GFX9-DPP-NEXT:    v_readlane_b32 s42, v8, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s2, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x24
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[11:12], v0, s[2:3]
 ; GFX9-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], s[42:43], s[42:43]
+; GFX9-DPP-NEXT:    v_max_f64 v[1:2], s[0:1], s[0:1]
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[1:2], v[1:2]
-; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s45
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-DPP-NEXT:    v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT:    v_max_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT:    global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX9-DPP-NEXT:  .LBB7_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s33, s8
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1064-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v9, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s5, v9, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s4, v8, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v8, 0
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], s[4:5], s[4:5]
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], s[2:3], s[2:3]
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, v9
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[42:43]
-; GFX1064-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[41:42], v[41:42]
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[1:2], v[1:2]
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB7_2
-; GFX1064-DPP-NEXT:  .LBB7_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB7_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s33, s8
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1032-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[42:43]
-; GFX1032-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB7_2
-; GFX1032-DPP-NEXT:  .LBB7_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB7_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b32 s33, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
-; GFX1164-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT:    s_mov_b32 s41, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1164-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1164-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
+; GFX1164-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v11, v9
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v10, v8
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[42:43]
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT:    .p2align 6
+; GFX1164-DPP-NEXT:    global_load_b64 v[10:11], v12, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
-; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX1164-DPP-NEXT:  .LBB7_3:
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[42:43]
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT:    .p2align 6
+; GFX1132-DPP-NEXT:    global_load_b64 v[10:11], v12, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
-; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX1132-DPP-NEXT:  .LBB7_3:
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call double @div.double.value()
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
   ret void
 }
 
@@ -6003,59 +4446,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1064-NEXT:  .LBB8_3:
+; GFX1064-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:  .LBB8_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1032-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1032-NEXT:  .LBB8_3:
+; GFX1032-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT:  .LBB8_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
@@ -6123,43 +4538,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:  .LBB8_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB8_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s7
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX7LESS-DPP-NEXT:  .LBB8_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6197,59 +4575,31 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1064-DPP-NEXT:  .LBB8_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB8_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1032-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1032-DPP-NEXT:  .LBB8_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB8_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
@@ -6321,86 +4671,86 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 }
 
 define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
-; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_mov_b32 s39, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s36, s36, s9
-; GFX7LESS-NEXT:    s_addc_u32 s37, s37, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s6
-; GFX7LESS-NEXT:    s_mov_b32 s13, s7
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
-; GFX7LESS-NEXT:  .LBB9_1: ; %ComputeLoop
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7LESS-NEXT:    v_readlane_b32 s3, v1, s4
-; GFX7LESS-NEXT:    v_readlane_b32 s2, v0, s4
-; GFX7LESS-NEXT:    s_lshl_b64 s[4:5], 1, s4
-; GFX7LESS-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
-; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
-; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB9_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX7LESS-NEXT:    s_cbranch_execz .LBB9_5
-; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX7LESS-NEXT:  .LBB9_4: ; %atomicrmw.start
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB9_4
-; GFX7LESS-NEXT:  .LBB9_5:
-; GFX7LESS-NEXT:    s_endpgm
+; GFX7LESS-ITERATIVE-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX7LESS-ITERATIVE:       ; %bb.0:
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s39, 0xe8f000
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s36, s36, s9
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s37, s37, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s8, s34, 44
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s9, s35, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX7LESS-ITERATIVE-NEXT:  .LBB9_1: ; %ComputeLoop
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX7LESS-ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_vccnz .LBB9_1
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execz .LBB9_5
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.3:
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:  .LBB9_4: ; %atomicrmw.start
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-ITERATIVE-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v3, v7
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execnz .LBB9_4
+; GFX7LESS-ITERATIVE-NEXT:  .LBB9_5:
+; GFX7LESS-ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
 ; GFX9:       ; %bb.0:
@@ -6503,20 +4853,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1064-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1064-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6524,28 +4874,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB9_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
-; GFX1064-NEXT:  .LBB9_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB9_4
-; GFX1064-NEXT:  .LBB9_5:
+; GFX1064-NEXT:    global_atomic_fmax_x2 v0, v[2:3], s[0:1]
+; GFX1064-NEXT:  .LBB9_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -6576,48 +4911,33 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX1032-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB9_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
-; GFX1032-NEXT:  .LBB9_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB9_4
-; GFX1032-NEXT:  .LBB9_5:
+; GFX1032-NEXT:    global_atomic_fmax_x2 v0, v[2:3], s[0:1]
+; GFX1032-NEXT:  .LBB9_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -7008,28 +5328,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB9_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[11:12], v2, s[0:1]
-; GFX1064-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f64 v[9:10], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[13:14], v[11:12], v[11:12]
-; GFX1064-DPP-NEXT:    v_max_f64 v[9:10], v[13:14], v[9:10]
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v10
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB9_2
-; GFX1064-DPP-NEXT:  .LBB9_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB9_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -7104,30 +5409,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB9_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[11:12], v2, s[0:1]
-; GFX1032-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[9:10], v[11:12], v[11:12]
-; GFX1032-DPP-NEXT:    v_max_f64 v[9:10], v[9:10], v[0:1]
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v10
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB9_2
-; GFX1032-DPP-NEXT:  .LBB9_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB9_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
@@ -7345,1027 +5635,426 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
 ; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX7LESS-NEXT:  ; %bb.1:
-; GFX7LESS-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x9
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[44:45], 0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v3, v0, v1
+; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7LESS-NEXT:    v_or_b32_e32 v40, v3, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], 4.0
-; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s42
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s43
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v5, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v4
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v5
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX7LESS-NEXT:  .LBB10_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX9-NEXT:    s_mov_b32 s33, s8
-; GFX9-NEXT:    s_mov_b32 s40, s7
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX9-NEXT:    s_mov_b32 s41, s6
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[44:45], 0
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_or3_b32 v40, v0, v4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s41
-; GFX9-NEXT:    s_mov_b32 s13, s40
-; GFX9-NEXT:    s_mov_b32 s14, s33
-; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s42
-; GFX9-NEXT:    v_mov_b32_e32 v3, s43
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-NEXT:  .LBB10_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-NEXT:    s_add_u32 s48, s48, s9
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT:    s_movk_i32 s32, 0x800
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB10_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1064-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s33, s8
-; GFX1064-NEXT:    s_mov_b32 s40, s7
-; GFX1064-NEXT:    s_mov_b32 s41, s6
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1064-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1064-NEXT:  .LBB10_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-NEXT:    s_add_u32 s8, s36, 44
-; GFX1064-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1064-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-NEXT:    s_mov_b32 s12, s41
-; GFX1064-NEXT:    s_mov_b32 s13, s40
-; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB10_2
-; GFX1064-NEXT:  .LBB10_3:
+; GFX1064-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:  .LBB10_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-NEXT:    s_add_u32 s48, s48, s9
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT:    s_mov_b32 s44, 0
-; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB10_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1032-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s33, s8
-; GFX1032-NEXT:    s_mov_b32 s40, s7
-; GFX1032-NEXT:    s_mov_b32 s41, s6
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1032-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1032-NEXT:  .LBB10_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-NEXT:    s_add_u32 s8, s36, 44
-; GFX1032-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1032-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s12, s41
-; GFX1032-NEXT:    s_mov_b32 s13, s40
-; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-NEXT:    s_cbranch_execnz .LBB10_2
-; GFX1032-NEXT:  .LBB10_3:
+; GFX1032-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT:  .LBB10_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1164-NEXT:    s_mov_b32 s33, s8
-; GFX1164-NEXT:    s_mov_b32 s40, s7
-; GFX1164-NEXT:    s_mov_b32 s41, s6
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT:    .p2align 6
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1164-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-NEXT:    s_add_u32 s8, s36, 44
-; GFX1164-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1164-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-NEXT:    s_mov_b32 s12, s41
-; GFX1164-NEXT:    s_mov_b32 s13, s40
-; GFX1164-NEXT:    s_mov_b32 s14, s33
-; GFX1164-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-NEXT:  .LBB10_3:
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1132-NEXT:    s_mov_b32 s44, 0
-; GFX1132-NEXT:    s_mov_b32 s32, 32
+; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    s_mov_b32 s40, s14
-; GFX1132-NEXT:    s_mov_b32 s41, s13
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-NEXT:    s_mov_b64 s[36:37], s[2:3]
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT:    .p2align 6
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX1132-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-NEXT:    s_add_u32 s8, s36, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1132-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s12, s41
-; GFX1132-NEXT:    s_mov_b32 s13, s40
-; GFX1132-NEXT:    s_mov_b32 s14, s33
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1132-NEXT:  .LBB10_3:
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB10_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[44:45], 0
-; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v3, v0, v1
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v40, v3, v2
-; GFX7LESS-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], 4.0
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB10_2
-; GFX7LESS-DPP-NEXT:  .LBB10_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX9-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX9-DPP-NEXT:    s_mov_b32 s40, s7
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-DPP-NEXT:  .LBB10_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB10_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX1064-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1064-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB10_2
-; GFX1064-DPP-NEXT:  .LBB10_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB10_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
-; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB10_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX1032-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1032-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB10_2
-; GFX1032-DPP-NEXT:  .LBB10_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB10_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX1164-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT:    .p2align 6
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-DPP-NEXT:  .LBB10_3:
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT:    .p2align 6
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX1132-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], 4.0
-; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB10_2
-; GFX1132-DPP-NEXT:  .LBB10_3:
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
-; GFX1132-DPP-NEXT:    s_endpgm
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 monotonic, align 4
-  ret void
-}
-
-define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
-; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
-; GFX7LESS-NEXT:  .LBB11_1: ; %ComputeLoop
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7LESS-NEXT:    v_readlane_b32 s3, v1, s4
-; GFX7LESS-NEXT:    v_readlane_b32 s2, v0, s4
-; GFX7LESS-NEXT:    s_lshl_b64 s[4:5], 1, s4
-; GFX7LESS-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
-; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
-; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB11_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX7LESS-NEXT:    s_cbranch_execz .LBB11_5
-; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s46, -1
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[42:43], 0
-; GFX7LESS-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX7LESS-NEXT:  .LBB11_4: ; %atomicrmw.start
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[41:42]
-; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[42:43]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX7LESS-NEXT:  .LBB11_5:
-; GFX7LESS-NEXT:    s_endpgm
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB10_2
+; GFX1132-DPP-NEXT:  .LBB10_3:
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 monotonic, align 8
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-ITERATIVE-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX7LESS-ITERATIVE:       ; %bb.0:
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s39, 0xe8f000
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s36, s36, s9
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s37, s37, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s8, s34, 44
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s9, s35, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX7LESS-ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX7LESS-ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_vccnz .LBB11_1
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execz .LBB11_5
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.3:
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:  .LBB11_4: ; %atomicrmw.start
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-ITERATIVE-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v3, v7
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execnz .LBB11_4
+; GFX7LESS-ITERATIVE-NEXT:  .LBB11_5:
+; GFX7LESS-ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT:    s_mov_b32 s33, s8
-; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s40, s7
-; GFX9-NEXT:    s_mov_b32 s41, s6
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s41
-; GFX9-NEXT:    s_mov_b32 s13, s40
-; GFX9-NEXT:    s_mov_b32 s14, s33
-; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX9-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s4, s[0:1]
 ; GFX9-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX9-NEXT:    v_readlane_b32 s2, v0, s4
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX9-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX9-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX9-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8375,103 +6064,70 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_load_dwordx2 s[42:43], s[36:37], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX9-NEXT:    s_mov_b64 s[44:45], 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v0, s[42:43]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
 ; GFX9-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s41
-; GFX9-NEXT:    s_mov_b32 s13, s40
-; GFX9-NEXT:    s_mov_b32 s14, s33
-; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s42
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s43
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX9-NEXT:  .LBB11_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s9
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b32 s33, s8
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_mov_b32 s40, s7
-; GFX1064-NEXT:    s_mov_b32 s41, s6
-; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT:    s_mov_b32 s12, s41
-; GFX1064-NEXT:    s_mov_b32 s13, s40
-; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    s_movk_i32 s32, 0x800
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT:    v_max_f64 v[2:3], v[3:4], v[3:4]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1064-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1064-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[2:3], v[4:5]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8479,92 +6135,43 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB11_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1064-NEXT:  ; %bb.3:
-; GFX1064-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1064-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[42:43]
-; GFX1064-NEXT:  .LBB11_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s41
-; GFX1064-NEXT:    s_mov_b32 s13, s40
-; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX1064-NEXT:  .LBB11_5:
+; GFX1064-NEXT:    global_atomic_fmax_x2 v0, v[2:3], s[0:1]
+; GFX1064-NEXT:  .LBB11_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s9
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b32 s33, s8
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_mov_b32 s40, s7
-; GFX1032-NEXT:    s_mov_b32 s41, s6
-; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT:    s_mov_b32 s12, s41
-; GFX1032-NEXT:    s_mov_b32 s13, s40
-; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
@@ -8582,93 +6189,44 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB11_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1032-NEXT:  ; %bb.3:
-; GFX1032-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[4:5], v0, s[42:43]
-; GFX1032-NEXT:  .LBB11_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0
-; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s41
-; GFX1032-NEXT:    s_mov_b32 s13, s40
-; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX1032-NEXT:  .LBB11_5:
+; GFX1032-NEXT:    global_atomic_fmax_x2 v0, v[2:3], s[0:1]
+; GFX1032-NEXT:  .LBB11_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1164-NEXT:    s_mov_b32 s33, s8
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-NEXT:    s_mov_b32 s12, s6
 ; GFX1164-NEXT:    s_mov_b32 s13, s7
-; GFX1164-NEXT:    s_mov_b32 s14, s33
-; GFX1164-NEXT:    s_mov_b32 s32, 32
-; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
-; GFX1164-NEXT:    s_mov_b32 s40, s7
-; GFX1164-NEXT:    s_mov_b32 s41, s6
+; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s4, s[0:1]
-; GFX1164-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX1164-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1164-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
@@ -8677,7 +6235,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8688,87 +6246,56 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX1164-NEXT:  ; %bb.3:
-; GFX1164-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1164-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b64 v[4:5], v0, s[42:43]
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT:    .p2align 6
+; GFX1164-NEXT:    global_load_b64 v[2:3], v6, s[0:1]
 ; GFX1164-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s41
-; GFX1164-NEXT:    s_mov_b32 s13, s40
-; GFX1164-NEXT:    s_mov_b32 s14, s33
-; GFX1164-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[4:5], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[4:5], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX1164-NEXT:  .LBB11_5:
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1132-NEXT:    s_mov_b64 s[38:39], s[0:1]
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1132-NEXT:    s_mov_b32 s40, s14
-; GFX1132-NEXT:    s_mov_b32 s41, s13
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
-; GFX1132-NEXT:    s_mov_b32 s32, 32
-; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
@@ -8777,837 +6304,592 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-NEXT:    v_max_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s44, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX1132-NEXT:  ; %bb.3:
-; GFX1132-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b64 v[4:5], v0, s[42:43]
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT:    .p2align 6
+; GFX1132-NEXT:    global_load_b64 v[2:3], v6, s[0:1]
 ; GFX1132-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX1132-NEXT:    s_add_u32 s8, s34, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT:    s_mov_b32 s12, s41
-; GFX1132-NEXT:    s_mov_b32 s13, s40
-; GFX1132-NEXT:    s_mov_b32 s14, s33
-; GFX1132-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1132-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[4:5], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off offset:8
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[4:5], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX1132-NEXT:  .LBB11_5:
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s46, -1
-; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[44:47], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[42:43], 0
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX7LESS-DPP-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[41:42]
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX9-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_readlane_b32 s43, v9, 63
-; GFX9-DPP-NEXT:    v_readlane_b32 s42, v8, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s2, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x24
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[11:12], v0, s[2:3]
 ; GFX9-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], s[42:43], s[42:43]
+; GFX9-DPP-NEXT:    v_max_f64 v[1:2], s[0:1], s[0:1]
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[1:2], v[1:2]
-; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s45
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-DPP-NEXT:    v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT:    v_max_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT:    global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX9-DPP-NEXT:  .LBB11_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s33, s8
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1064-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v9, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s5, v9, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s4, v8, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v8, 0
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], s[4:5], s[4:5]
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], s[2:3], s[2:3]
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, v9
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB11_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[42:43]
-; GFX1064-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[41:42], v[41:42]
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[1:2], v[1:2]
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB11_2
-; GFX1064-DPP-NEXT:  .LBB11_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB11_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s33, s8
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1032-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB11_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[42:43]
-; GFX1032-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB11_2
-; GFX1032-DPP-NEXT:  .LBB11_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB11_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b32 s33, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
-; GFX1164-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT:    s_mov_b32 s41, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1164-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1164-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
+; GFX1164-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v11, v9
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v10, v8
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[42:43]
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT:    .p2align 6
+; GFX1164-DPP-NEXT:    global_load_b64 v[10:11], v12, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
-; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX1164-DPP-NEXT:  .LBB11_3:
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[10:11]
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[42:43]
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT:    .p2align 6
+; GFX1132-DPP-NEXT:    global_load_b64 v[10:11], v12, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[41:42]
-; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX1132-DPP-NEXT:  .LBB11_3:
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call double @div.double.value()
-  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 4
+  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 8
   ret void
 }
 
@@ -9680,55 +6962,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1064-NEXT:  .LBB12_3:
+; GFX1064-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB12_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1032-NEXT:  .LBB12_3:
+; GFX1032-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB12_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9738,95 +6994,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1164-NEXT:  .LBB12_3:
+; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB12_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1132-NEXT:  .LBB12_3:
+; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB12_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB12_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX7LESS-DPP-NEXT:  .LBB12_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9862,55 +7058,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1064-DPP-NEXT:  .LBB12_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB12_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1032-DPP-NEXT:  .LBB12_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB12_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9920,60 +7090,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1164-DPP-NEXT:  .LBB12_3:
+; GFX1164-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB12_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1132-DPP-NEXT:  .LBB12_3:
+; GFX1132-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB12_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1
   ret void
@@ -10048,55 +7191,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1064-NEXT:  .LBB13_3:
+; GFX1064-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB13_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1032-NEXT:  .LBB13_3:
+; GFX1032-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB13_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -10106,95 +7223,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1164-NEXT:  .LBB13_3:
+; GFX1164-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB13_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1132-NEXT:  .LBB13_3:
+; GFX1132-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB13_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB13_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX7LESS-DPP-NEXT:  .LBB13_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10230,55 +7287,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1064-DPP-NEXT:  .LBB13_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB13_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1032-DPP-NEXT:  .LBB13_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB13_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -10288,60 +7319,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1164-DPP-NEXT:  .LBB13_3:
+; GFX1164-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB13_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1132-DPP-NEXT:  .LBB13_3:
+; GFX1132-DPP-NEXT:    global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB13_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index bd5e589ec2be7..3dac8222d65c5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS-ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
@@ -145,39 +145,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB0_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB0_2
-; GFX7LESS-DPP-NEXT:  .LBB0_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -278,81 +245,81 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
 }
 
 define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
-; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_mov_b32 s39, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s36, s36, s9
-; GFX7LESS-NEXT:    s_addc_u32 s37, s37, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s6
-; GFX7LESS-NEXT:    s_mov_b32 s13, s7
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX7LESS-NEXT:  .LBB1_1: ; %ComputeLoop
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7LESS-NEXT:    v_readlane_b32 s4, v0, s2
-; GFX7LESS-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX7LESS-NEXT:    v_mul_f32_e64 v2, 1.0, s4
-; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
-; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_min_f32_e32 v2, v1, v2
-; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB1_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_5
-; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7LESS-NEXT:  .LBB1_4: ; %atomicrmw.start
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB1_4
-; GFX7LESS-NEXT:  .LBB1_5:
-; GFX7LESS-NEXT:    s_endpgm
+; GFX7LESS-ITERATIVE-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS-ITERATIVE:       ; %bb.0:
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s39, 0xe8f000
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s36, s36, s9
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s37, s37, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s8, s34, 44
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s9, s35, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7LESS-ITERATIVE-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX7LESS-ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e64 v2, 1.0, s4
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_vccnz .LBB1_1
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execz .LBB1_5
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.3:
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-ITERATIVE-NEXT:  .LBB1_4: ; %atomicrmw.start
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-ITERATIVE-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-ITERATIVE-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execnz .LBB1_4
+; GFX7LESS-ITERATIVE-NEXT:  .LBB1_5:
+; GFX7LESS-ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
@@ -1148,55 +1115,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1064-NEXT:  .LBB2_3:
+; GFX1064-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB2_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1032-NEXT:  .LBB2_3:
+; GFX1032-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB2_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
@@ -1206,95 +1147,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1164-NEXT:  .LBB2_3:
+; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB2_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1132-NEXT:  .LBB2_3:
+; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB2_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB2_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX7LESS-DPP-NEXT:  .LBB2_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1330,55 +1211,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1064-DPP-NEXT:  .LBB2_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB2_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1032-DPP-NEXT:  .LBB2_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB2_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
@@ -1388,60 +1243,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1164-DPP-NEXT:  .LBB2_3:
+; GFX1164-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB2_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX1132-DPP-NEXT:  .LBB2_3:
+; GFX1132-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB2_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
   ret void
@@ -1449,81 +1277,81 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 
 
 define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
-; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_mov_b32 s39, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s36, s36, s9
-; GFX7LESS-NEXT:    s_addc_u32 s37, s37, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s6
-; GFX7LESS-NEXT:    s_mov_b32 s13, s7
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX7LESS-NEXT:  .LBB3_1: ; %ComputeLoop
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7LESS-NEXT:    v_readlane_b32 s4, v0, s2
-; GFX7LESS-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX7LESS-NEXT:    v_mul_f32_e64 v2, 1.0, s4
-; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
-; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_min_f32_e32 v2, v1, v2
-; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB3_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX7LESS-NEXT:    s_cbranch_execz .LBB3_5
-; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7LESS-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX7LESS-NEXT:  .LBB3_5:
-; GFX7LESS-NEXT:    s_endpgm
+; GFX7LESS-ITERATIVE-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX7LESS-ITERATIVE:       ; %bb.0:
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s39, 0xe8f000
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s36, s36, s9
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s37, s37, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s8, s34, 44
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s9, s35, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7LESS-ITERATIVE-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX7LESS-ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e64 v2, 1.0, s4
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_vccnz .LBB3_1
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execz .LBB3_5
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.3:
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-ITERATIVE-NEXT:  .LBB3_4: ; %atomicrmw.start
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-ITERATIVE-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-ITERATIVE-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX7LESS-ITERATIVE-NEXT:  .LBB3_5:
+; GFX7LESS-ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
 ; GFX9:       ; %bb.0:
@@ -1623,18 +1451,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1064-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1642,27 +1470,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1064-NEXT:  .LBB3_5:
+; GFX1064-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB3_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -1693,45 +1507,31 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1032-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1032-NEXT:  .LBB3_5:
+; GFX1032-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB3_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -1752,13 +1552,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
@@ -1766,7 +1566,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1164-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1775,29 +1575,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1164-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1164-NEXT:  .LBB3_5:
+; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB3_4:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -1818,13 +1602,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1132-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1832,36 +1616,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1132-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execz .LBB3_5
+; GFX1132-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT:  .LBB3_4: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX1132-NEXT:  .LBB3_5:
+; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB3_4:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -2080,27 +1849,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1064-DPP-NEXT:  .LBB3_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin v1, v0, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB3_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -2162,29 +1917,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1032-DPP-NEXT:  .LBB3_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin v1, v0, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB3_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -2252,34 +1993,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1164-DPP-NEXT:  .LBB3_3:
+; GFX1164-DPP-NEXT:    global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB3_2:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
@@ -2335,34 +2060,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB3_2
-; GFX1132-DPP-NEXT:  .LBB3_3:
+; GFX1132-DPP-NEXT:    global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB3_2:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
@@ -2439,55 +2148,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1064-NEXT:  .LBB4_3:
+; GFX1064-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB4_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1032-NEXT:  .LBB4_3:
+; GFX1032-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB4_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
@@ -2497,95 +2180,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1164-NEXT:  .LBB4_3:
+; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB4_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1132-NEXT:  .LBB4_3:
+; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB4_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB4_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX7LESS-DPP-NEXT:  .LBB4_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2621,55 +2244,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1064-DPP-NEXT:  .LBB4_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB4_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1032-DPP-NEXT:  .LBB4_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB4_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
@@ -2679,141 +2276,114 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1164-DPP-NEXT:  .LBB4_3:
+; GFX1164-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB4_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX1132-DPP-NEXT:  .LBB4_3:
+; GFX1132-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB4_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
   ret void
 }
 
 define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
-; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_mov_b32 s39, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s36, s36, s9
-; GFX7LESS-NEXT:    s_addc_u32 s37, s37, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s6
-; GFX7LESS-NEXT:    s_mov_b32 s13, s7
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX7LESS-NEXT:  .LBB5_1: ; %ComputeLoop
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX7LESS-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; GFX7LESS-NEXT:    v_readlane_b32 s4, v0, s2
-; GFX7LESS-NEXT:    s_lshl_b64 s[2:3], 1, s2
-; GFX7LESS-NEXT:    v_mul_f32_e64 v2, 1.0, s4
-; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
-; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_min_f32_e32 v2, v1, v2
-; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB5_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX7LESS-NEXT:    s_cbranch_execz .LBB5_5
-; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[0:3], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7LESS-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
-; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, v3
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX7LESS-NEXT:  .LBB5_5:
-; GFX7LESS-NEXT:    s_endpgm
+; GFX7LESS-ITERATIVE-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX7LESS-ITERATIVE:       ; %bb.0:
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s39, 0xe8f000
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s36, s36, s9
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s37, s37, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s8, s34, 44
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s9, s35, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX7LESS-ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX7LESS-ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e64 v2, 1.0, s4
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_vccnz .LBB5_1
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execz .LBB5_5
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.3:
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    buffer_load_dword v1, off, s[0:3], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7LESS-ITERATIVE-NEXT:  .LBB5_4: ; %atomicrmw.start
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mul_f32_e32 v0, 1.0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v4, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v3, v0
+; GFX7LESS-ITERATIVE-NEXT:    buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
+; GFX7LESS-ITERATIVE-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX7LESS-ITERATIVE-NEXT:  .LBB5_5:
+; GFX7LESS-ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
 ; GFX9:       ; %bb.0:
@@ -2913,18 +2483,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1064-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1064-NEXT:    v_max_f32_e64 v2, s3, s3
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s2
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1064-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2932,27 +2502,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1064-NEXT:  .LBB5_5:
+; GFX1064-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB5_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -2983,45 +2539,31 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1032-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1032-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1032-NEXT:  .LBB5_5:
+; GFX1032-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB5_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -3042,13 +2584,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1164-NEXT:    v_readlane_b32 s3, v0, s2
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_max_f32_e64 v2, s3, s3
@@ -3056,7 +2598,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1164-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3065,29 +2607,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1164-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1164-NEXT:  .LBB5_5:
+; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB5_4:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -3108,13 +2634,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX1132-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB5_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3122,36 +2648,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX1132-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1132-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT:  .LBB5_4: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT:    s_cbranch_execnz .LBB5_4
-; GFX1132-NEXT:  .LBB5_5:
+; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB5_4:
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -3370,27 +2881,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1064-DPP-NEXT:  .LBB5_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin v1, v0, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB5_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -3452,29 +2949,15 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1032-DPP-NEXT:  .LBB5_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin v1, v0, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB5_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -3542,34 +3025,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1164-DPP-NEXT:  .LBB5_3:
+; GFX1164-DPP-NEXT:    global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB5_2:
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
@@ -3625,34 +3092,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB5_2
-; GFX1132-DPP-NEXT:  .LBB5_3:
+; GFX1132-DPP-NEXT:    global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB5_2:
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call float @div.float.value()
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue monotonic, align 4
@@ -3662,1027 +3113,426 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
 ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX7LESS-NEXT:  ; %bb.1:
-; GFX7LESS-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x9
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[44:45], 0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v3, v0, v1
+; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7LESS-NEXT:    v_or_b32_e32 v40, v3, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_min_f64 v[0:1], v[2:3], 4.0
-; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s42
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s43
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v5, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v4
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v5
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX7LESS-NEXT:  .LBB6_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX9-NEXT:    s_mov_b32 s33, s8
-; GFX9-NEXT:    s_mov_b32 s40, s7
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX9-NEXT:    s_mov_b32 s41, s6
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[44:45], 0
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_or3_b32 v40, v0, v4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s41
-; GFX9-NEXT:    s_mov_b32 s13, s40
-; GFX9-NEXT:    s_mov_b32 s14, s33
-; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s42
-; GFX9-NEXT:    v_mov_b32_e32 v3, s43
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX9-NEXT:  .LBB6_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-NEXT:    s_add_u32 s48, s48, s9
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT:    s_movk_i32 s32, 0x800
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1064-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s33, s8
-; GFX1064-NEXT:    s_mov_b32 s40, s7
-; GFX1064-NEXT:    s_mov_b32 s41, s6
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1064-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1064-NEXT:  .LBB6_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-NEXT:    s_add_u32 s8, s36, 44
-; GFX1064-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1064-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-NEXT:    s_mov_b32 s12, s41
-; GFX1064-NEXT:    s_mov_b32 s13, s40
-; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB6_2
-; GFX1064-NEXT:  .LBB6_3:
+; GFX1064-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:  .LBB6_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-NEXT:    s_add_u32 s48, s48, s9
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT:    s_mov_b32 s44, 0
-; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1032-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s33, s8
-; GFX1032-NEXT:    s_mov_b32 s40, s7
-; GFX1032-NEXT:    s_mov_b32 s41, s6
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1032-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1032-NEXT:  .LBB6_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-NEXT:    s_add_u32 s8, s36, 44
-; GFX1032-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1032-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s12, s41
-; GFX1032-NEXT:    s_mov_b32 s13, s40
-; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-NEXT:    s_cbranch_execnz .LBB6_2
-; GFX1032-NEXT:  .LBB6_3:
+; GFX1032-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT:  .LBB6_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1164-NEXT:    s_mov_b32 s33, s8
-; GFX1164-NEXT:    s_mov_b32 s40, s7
-; GFX1164-NEXT:    s_mov_b32 s41, s6
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT:    .p2align 6
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1164-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-NEXT:    s_add_u32 s8, s36, 44
-; GFX1164-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1164-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-NEXT:    s_mov_b32 s12, s41
-; GFX1164-NEXT:    s_mov_b32 s13, s40
-; GFX1164-NEXT:    s_mov_b32 s14, s33
-; GFX1164-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1164-NEXT:  .LBB6_3:
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1132-NEXT:    s_mov_b32 s44, 0
-; GFX1132-NEXT:    s_mov_b32 s32, 32
+; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    s_mov_b32 s40, s14
-; GFX1132-NEXT:    s_mov_b32 s41, s13
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-NEXT:    s_mov_b64 s[36:37], s[2:3]
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT:    .p2align 6
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX1132-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-NEXT:    s_add_u32 s8, s36, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1132-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s12, s41
-; GFX1132-NEXT:    s_mov_b32 s13, s40
-; GFX1132-NEXT:    s_mov_b32 s14, s33
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1132-NEXT:  .LBB6_3:
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB6_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[44:45], 0
-; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v3, v0, v1
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v40, v3, v2
-; GFX7LESS-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_min_f64 v[0:1], v[2:3], 4.0
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB6_2
-; GFX7LESS-DPP-NEXT:  .LBB6_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX9-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX9-DPP-NEXT:    s_mov_b32 s40, s7
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_2
+; GFX9-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX9-DPP-NEXT:  .LBB6_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX1064-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1064-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB6_2
-; GFX1064-DPP-NEXT:  .LBB6_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB6_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
-; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX1032-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1032-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB6_2
-; GFX1032-DPP-NEXT:  .LBB6_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB6_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX1164-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT:    .p2align 6
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1164-DPP-NEXT:  .LBB6_3:
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB6_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT:    .p2align 6
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX1132-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB6_2
 ; GFX1132-DPP-NEXT:  .LBB6_3:
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-DPP-NEXT:    s_endpgm
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8
   ret void
 }
 
 define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
-; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
-; GFX7LESS-NEXT:  .LBB7_1: ; %ComputeLoop
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7LESS-NEXT:    v_readlane_b32 s3, v1, s4
-; GFX7LESS-NEXT:    v_readlane_b32 s2, v0, s4
-; GFX7LESS-NEXT:    s_lshl_b64 s[4:5], 1, s4
-; GFX7LESS-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
-; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
-; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB7_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_5
-; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s46, -1
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[42:43], 0
-; GFX7LESS-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX7LESS-NEXT:  .LBB7_4: ; %atomicrmw.start
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_min_f64 v[0:1], v[2:3], v[41:42]
-; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[42:43]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB7_4
-; GFX7LESS-NEXT:  .LBB7_5:
-; GFX7LESS-NEXT:    s_endpgm
+; GFX7LESS-ITERATIVE-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS-ITERATIVE:       ; %bb.0:
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s39, 0xe8f000
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s36, s36, s9
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s37, s37, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s8, s34, 44
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s9, s35, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX7LESS-ITERATIVE-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX7LESS-ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_vccnz .LBB7_1
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execz .LBB7_5
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.3:
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:  .LBB7_4: ; %atomicrmw.start
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-ITERATIVE-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v3, v7
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX7LESS-ITERATIVE-NEXT:  .LBB7_5:
+; GFX7LESS-ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT:    s_mov_b32 s33, s8
-; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s40, s7
-; GFX9-NEXT:    s_mov_b32 s41, s6
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s41
-; GFX9-NEXT:    s_mov_b32 s13, s40
-; GFX9-NEXT:    s_mov_b32 s14, s33
-; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX9-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s4, s[0:1]
 ; GFX9-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX9-NEXT:    v_readlane_b32 s2, v0, s4
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX9-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX9-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX9-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4692,103 +3542,70 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_load_dwordx2 s[42:43], s[36:37], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX9-NEXT:    s_mov_b64 s[44:45], 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v0, s[42:43]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
 ; GFX9-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s41
-; GFX9-NEXT:    s_mov_b32 s13, s40
-; GFX9-NEXT:    s_mov_b32 s14, s33
-; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s42
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s43
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX9-NEXT:  .LBB7_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s9
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b32 s33, s8
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_mov_b32 s40, s7
-; GFX1064-NEXT:    s_mov_b32 s41, s6
-; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT:    s_mov_b32 s12, s41
-; GFX1064-NEXT:    s_mov_b32 s13, s40
-; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    s_movk_i32 s32, 0x800
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT:    v_max_f64 v[2:3], v[3:4], v[3:4]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1064-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1064-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_min_f64 v[3:4], v[2:3], v[4:5]
+; GFX1064-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4796,92 +3613,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB7_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX1064-NEXT:  ; %bb.3:
-; GFX1064-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1064-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[42:43]
-; GFX1064-NEXT:  .LBB7_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s41
-; GFX1064-NEXT:    s_mov_b32 s13, s40
-; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB7_4
-; GFX1064-NEXT:  .LBB7_5:
+; GFX1064-NEXT:    global_atomic_fmin_x2 v0, v[2:3], s[0:1]
+; GFX1064-NEXT:  .LBB7_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s9
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b32 s33, s8
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_mov_b32 s40, s7
-; GFX1032-NEXT:    s_mov_b32 s41, s6
-; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT:    s_mov_b32 s12, s41
-; GFX1032-NEXT:    s_mov_b32 s13, s40
-; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
@@ -4899,93 +3667,44 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB7_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX1032-NEXT:  ; %bb.3:
-; GFX1032-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[4:5], v0, s[42:43]
-; GFX1032-NEXT:  .LBB7_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0
-; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s41
-; GFX1032-NEXT:    s_mov_b32 s13, s40
-; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-NEXT:    s_cbranch_execnz .LBB7_4
-; GFX1032-NEXT:  .LBB7_5:
+; GFX1032-NEXT:    global_atomic_fmin_x2 v0, v[2:3], s[0:1]
+; GFX1032-NEXT:  .LBB7_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1164-NEXT:    s_mov_b32 s33, s8
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-NEXT:    s_mov_b32 s12, s6
 ; GFX1164-NEXT:    s_mov_b32 s13, s7
-; GFX1164-NEXT:    s_mov_b32 s14, s33
-; GFX1164-NEXT:    s_mov_b32 s32, 32
-; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
-; GFX1164-NEXT:    s_mov_b32 s40, s7
-; GFX1164-NEXT:    s_mov_b32 s41, s6
+; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s4, s[0:1]
-; GFX1164-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX1164-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1164-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
@@ -4994,7 +3713,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5005,87 +3724,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX1164-NEXT:  ; %bb.3:
-; GFX1164-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1164-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b64 v[4:5], v0, s[42:43]
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT:    .p2align 6
+; GFX1164-NEXT:    global_load_b64 v[2:3], v6, s[0:1]
 ; GFX1164-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s41
-; GFX1164-NEXT:    s_mov_b32 s13, s40
-; GFX1164-NEXT:    s_mov_b32 s14, s33
-; GFX1164-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[4:5], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[4:5], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX1164-NEXT:  .LBB7_5:
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1132-NEXT:    s_mov_b64 s[38:39], s[0:1]
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1132-NEXT:    s_mov_b32 s40, s14
-; GFX1132-NEXT:    s_mov_b32 s41, s13
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
-; GFX1132-NEXT:    s_mov_b32 s32, 32
-; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
@@ -5094,837 +3782,592 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s44, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB7_5
 ; GFX1132-NEXT:  ; %bb.3:
-; GFX1132-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b64 v[4:5], v0, s[42:43]
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT:    .p2align 6
+; GFX1132-NEXT:    global_load_b64 v[2:3], v6, s[0:1]
 ; GFX1132-NEXT:  .LBB7_4: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX1132-NEXT:    s_add_u32 s8, s34, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT:    s_mov_b32 s12, s41
-; GFX1132-NEXT:    s_mov_b32 s13, s40
-; GFX1132-NEXT:    s_mov_b32 s14, s33
-; GFX1132-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1132-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[4:5], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off offset:8
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[4:5], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB7_4
 ; GFX1132-NEXT:  .LBB7_5:
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s46, -1
-; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[44:47], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[42:43], 0
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[41:42], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[42:43]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX7LESS-DPP-NEXT:    s_endpgm
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX9-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_readlane_b32 s43, v9, 63
-; GFX9-DPP-NEXT:    v_readlane_b32 s42, v8, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s2, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x24
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[11:12], v0, s[2:3]
 ; GFX9-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], s[42:43], s[42:43]
+; GFX9-DPP-NEXT:    v_max_f64 v[1:2], s[0:1], s[0:1]
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[1:2], v[1:2]
-; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s45
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-DPP-NEXT:    v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT:    v_min_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT:    global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX9-DPP-NEXT:  .LBB7_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s33, s8
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1064-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1064-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1064-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1064-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1064-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1064-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v9, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s5, v9, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s4, v8, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v8, 0
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], s[4:5], s[4:5]
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], s[2:3], s[2:3]
-; GFX1064-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, v9
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[42:43]
-; GFX1064-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[41:42], v[41:42]
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[1:2], v[1:2]
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB7_2
-; GFX1064-DPP-NEXT:  .LBB7_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB7_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s33, s8
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1032-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1032-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1032-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1032-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1032-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1032-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[42:43]
-; GFX1032-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB7_2
-; GFX1032-DPP-NEXT:  .LBB7_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB7_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b32 s33, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
-; GFX1164-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT:    s_mov_b32 s41, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1164-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1164-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1164-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1164-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1164-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
+; GFX1164-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1164-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1164-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v11, v9
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v10, v8
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1164-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[42:43]
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT:    .p2align 6
+; GFX1164-DPP-NEXT:    global_load_b64 v[10:11], v12, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
-; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX1164-DPP-NEXT:  .LBB7_3:
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1132-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1132-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1132-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
+; GFX1132-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB7_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[42:43]
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT:    .p2align 6
+; GFX1132-DPP-NEXT:    global_load_b64 v[10:11], v12, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
-; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB7_2
 ; GFX1132-DPP-NEXT:  .LBB7_3:
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call double @div.double.value()
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
   ret void
 }
 
@@ -6003,59 +4446,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1064-NEXT:  .LBB8_3:
+; GFX1064-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:  .LBB8_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1032-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1032-NEXT:  .LBB8_3:
+; GFX1032-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT:  .LBB8_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
@@ -6123,43 +4538,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 ; GFX1132-NEXT:  .LBB8_3:
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB8_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s7
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v3
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v2
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v4
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v5
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX7LESS-DPP-NEXT:  .LBB8_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6197,59 +4575,31 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1064-DPP-NEXT:  .LBB8_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB8_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s2
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s3
-; GFX1032-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX1032-DPP-NEXT:  .LBB8_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB8_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
@@ -6321,86 +4671,86 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 }
 
 define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
-; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_mov_b32 s32, 0
-; GFX7LESS-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_mov_b32 s39, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s36, s36, s9
-; GFX7LESS-NEXT:    s_addc_u32 s37, s37, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s8, s34, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s35, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
-; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s6
-; GFX7LESS-NEXT:    s_mov_b32 s13, s7
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[36:37]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
-; GFX7LESS-NEXT:  .LBB9_1: ; %ComputeLoop
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
-; GFX7LESS-NEXT:    v_readlane_b32 s3, v1, s4
-; GFX7LESS-NEXT:    v_readlane_b32 s2, v0, s4
-; GFX7LESS-NEXT:    s_lshl_b64 s[4:5], 1, s4
-; GFX7LESS-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
-; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
-; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB9_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX7LESS-NEXT:    s_cbranch_execz .LBB9_5
-; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX7LESS-NEXT:  .LBB9_4: ; %atomicrmw.start
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v9, v3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v8, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v0
-; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v7
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB9_4
-; GFX7LESS-NEXT:  .LBB9_5:
-; GFX7LESS-NEXT:    s_endpgm
+; GFX7LESS-ITERATIVE-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX7LESS-ITERATIVE:       ; %bb.0:
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s39, 0xe8f000
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s36, s36, s9
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s37, s37, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s8, s34, 44
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s9, s35, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX7LESS-ITERATIVE-NEXT:  .LBB9_1: ; %ComputeLoop
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX7LESS-ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_vccnz .LBB9_1
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execz .LBB9_5
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.3:
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:  .LBB9_4: ; %atomicrmw.start
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-ITERATIVE-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v3, v7
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execnz .LBB9_4
+; GFX7LESS-ITERATIVE-NEXT:  .LBB9_5:
+; GFX7LESS-ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
 ; GFX9:       ; %bb.0:
@@ -6503,20 +4853,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1064-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1064-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1064-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6524,28 +4874,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB9_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
-; GFX1064-NEXT:  .LBB9_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB9_4
-; GFX1064-NEXT:  .LBB9_5:
+; GFX1064-NEXT:    global_atomic_fmin_x2 v0, v[2:3], s[0:1]
+; GFX1064-NEXT:  .LBB9_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -6576,48 +4911,33 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1032-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX1032-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1032-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1032-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1032-NEXT:    s_lshl_b32 s1, 1, s1
 ; GFX1032-NEXT:    s_andn2_b32 s0, s0, s1
 ; GFX1032-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1032-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX1032-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX1032-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB9_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
-; GFX1032-NEXT:  .LBB9_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_execnz .LBB9_4
-; GFX1032-NEXT:  .LBB9_5:
+; GFX1032-NEXT:    global_atomic_fmin_x2 v0, v[2:3], s[0:1]
+; GFX1032-NEXT:  .LBB9_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -7008,28 +5328,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB9_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[11:12], v2, s[0:1]
-; GFX1064-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f64 v[9:10], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[13:14], v[11:12], v[11:12]
-; GFX1064-DPP-NEXT:    v_min_f64 v[9:10], v[13:14], v[9:10]
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v10
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB9_2
-; GFX1064-DPP-NEXT:  .LBB9_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB9_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -7104,30 +5409,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
-; GFX1032-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB9_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[11:12], v2, s[0:1]
-; GFX1032-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[9:10], v[11:12], v[11:12]
-; GFX1032-DPP-NEXT:    v_min_f64 v[9:10], v[9:10], v[0:1]
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v10
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB9_2
-; GFX1032-DPP-NEXT:  .LBB9_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB9_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
@@ -7345,1027 +5635,426 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
 ; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX7LESS-NEXT:  ; %bb.1:
-; GFX7LESS-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x9
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX7LESS-NEXT:    s_mov_b64 s[44:45], 0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v3, v0, v1
+; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7LESS-NEXT:    v_or_b32_e32 v40, v3, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_min_f64 v[0:1], v[2:3], 4.0
-; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s42
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s43
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT:    v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT:    v_mov_b32_e32 v5, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT:    buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, v4
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, v5
+; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX7LESS-NEXT:  .LBB10_3:
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX9-NEXT:    s_mov_b32 s33, s8
-; GFX9-NEXT:    s_mov_b32 s40, s7
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX9-NEXT:    s_mov_b32 s41, s6
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[44:45], 0
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    v_or3_b32 v40, v0, v4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s41
-; GFX9-NEXT:    s_mov_b32 s13, s40
-; GFX9-NEXT:    s_mov_b32 s14, s33
-; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s42
-; GFX9-NEXT:    v_mov_b32_e32 v3, s43
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-NEXT:  .LBB10_3:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-NEXT:    s_add_u32 s48, s48, s9
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT:    s_movk_i32 s32, 0x800
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB10_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1064-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1064-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s33, s8
-; GFX1064-NEXT:    s_mov_b32 s40, s7
-; GFX1064-NEXT:    s_mov_b32 s41, s6
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1064-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1064-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1064-NEXT:  .LBB10_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-NEXT:    s_add_u32 s8, s36, 44
-; GFX1064-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1064-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-NEXT:    s_mov_b32 s12, s41
-; GFX1064-NEXT:    s_mov_b32 s13, s40
-; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB10_2
-; GFX1064-NEXT:  .LBB10_3:
+; GFX1064-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT:  .LBB10_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-NEXT:    s_add_u32 s48, s48, s9
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT:    s_mov_b32 s44, 0
-; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB10_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1032-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1032-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s33, s8
-; GFX1032-NEXT:    s_mov_b32 s40, s7
-; GFX1032-NEXT:    s_mov_b32 s41, s6
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1032-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1032-NEXT:  .LBB10_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-NEXT:    s_add_u32 s8, s36, 44
-; GFX1032-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1032-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-NEXT:    s_mov_b32 s12, s41
-; GFX1032-NEXT:    s_mov_b32 s13, s40
-; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-NEXT:    s_cbranch_execnz .LBB10_2
-; GFX1032-NEXT:  .LBB10_3:
+; GFX1032-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT:  .LBB10_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1164-NEXT:    s_mov_b32 s33, s8
-; GFX1164-NEXT:    s_mov_b32 s40, s7
-; GFX1164-NEXT:    s_mov_b32 s41, s6
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT:    .p2align 6
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1164-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-NEXT:    s_add_u32 s8, s36, 44
-; GFX1164-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1164-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-NEXT:    s_mov_b32 s12, s41
-; GFX1164-NEXT:    s_mov_b32 s13, s40
-; GFX1164-NEXT:    s_mov_b32 s14, s33
-; GFX1164-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-NEXT:  .LBB10_3:
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1132-NEXT:    s_mov_b32 s44, 0
-; GFX1132-NEXT:    s_mov_b32 s32, 32
+; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    s_mov_b32 s40, s14
-; GFX1132-NEXT:    s_mov_b32 s41, s13
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-NEXT:    s_mov_b64 s[36:37], s[2:3]
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT:    .p2align 6
+; GFX1132-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX1132-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-NEXT:    s_add_u32 s8, s36, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1132-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-NEXT:    s_mov_b32 s12, s41
-; GFX1132-NEXT:    s_mov_b32 s13, s40
-; GFX1132-NEXT:    s_mov_b32 s14, s33
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1132-NEXT:  .LBB10_3:
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v3, exec_hi, v3
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB10_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[44:45], 0
-; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v3, v0, v1
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v40, v3, v2
-; GFX7LESS-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_min_f64 v[0:1], v[2:3], 4.0
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB10_2
-; GFX7LESS-DPP-NEXT:  .LBB10_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX9-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX9-DPP-NEXT:    s_mov_b32 s40, s7
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX9-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX9-DPP-NEXT:  .LBB10_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB10_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX1064-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1064-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB10_2
-; GFX1064-DPP-NEXT:  .LBB10_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB10_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
-; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB10_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
-; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v4, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX1032-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v4, v3
-; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[42:43], 0x0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1032-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB10_2
-; GFX1032-DPP-NEXT:  .LBB10_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB10_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX1164-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT:    .p2align 6
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB10_2
 ; GFX1164-DPP-NEXT:  .LBB10_3:
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[42:43], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[42:43], 0x0
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT:    .p2align 6
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX1132-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], 4.0
-; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB10_2
-; GFX1132-DPP-NEXT:  .LBB10_3:
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
-; GFX1132-DPP-NEXT:    s_endpgm
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 monotonic, align 4
-  ret void
-}
-
-define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
-; GFX7LESS:       ; %bb.0:
-; GFX7LESS-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], exec
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
-; GFX7LESS-NEXT:  .LBB11_1: ; %ComputeLoop
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7LESS-NEXT:    v_readlane_b32 s3, v1, s4
-; GFX7LESS-NEXT:    v_readlane_b32 s2, v0, s4
-; GFX7LESS-NEXT:    s_lshl_b64 s[4:5], 1, s4
-; GFX7LESS-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX7LESS-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX7LESS-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
-; GFX7LESS-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; GFX7LESS-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
-; GFX7LESS-NEXT:    s_cbranch_vccnz .LBB11_1
-; GFX7LESS-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX7LESS-NEXT:    s_cbranch_execz .LBB11_5
-; GFX7LESS-NEXT:  ; %bb.3:
-; GFX7LESS-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x9
-; GFX7LESS-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s46, -1
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_load_dwordx2 v[0:1], off, s[44:47], 0
-; GFX7LESS-NEXT:    s_mov_b64 s[42:43], 0
-; GFX7LESS-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX7LESS-NEXT:  .LBB11_4: ; %atomicrmw.start
-; GFX7LESS-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT:    v_max_f64 v[2:3], v[0:1], v[0:1]
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_min_f64 v[0:1], v[2:3], v[41:42]
-; GFX7LESS-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s45
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX7LESS-NEXT:    buffer_load_dword v0, off, s[48:51], 0
-; GFX7LESS-NEXT:    buffer_load_dword v1, off, s[48:51], 0 offset:4
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX7LESS-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-NEXT:    s_andn2_b64 exec, exec, s[42:43]
-; GFX7LESS-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX7LESS-NEXT:  .LBB11_5:
-; GFX7LESS-NEXT:    s_endpgm
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB10_2
+; GFX1132-DPP-NEXT:  .LBB10_3:
+; GFX1132-DPP-NEXT:    s_endpgm
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 monotonic, align 8
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-ITERATIVE-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX7LESS-ITERATIVE:       ; %bb.0:
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s39, 0xe8f000
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s36, s36, s9
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s37, s37, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s8, s34, 44
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s9, s35, 0
+; GFX7LESS-ITERATIVE-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX7LESS-ITERATIVE-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX7LESS-ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX7LESS-ITERATIVE-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX7LESS-ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_vccnz .LBB11_1
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execz .LBB11_5
+; GFX7LESS-ITERATIVE-NEXT:  ; %bb.3:
+; GFX7LESS-ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x9
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; GFX7LESS-ITERATIVE-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:  .LBB11_4: ; %atomicrmw.start
+; GFX7LESS-ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-ITERATIVE-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-ITERATIVE-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-ITERATIVE-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-ITERATIVE-NEXT:    v_mov_b32_e32 v3, v7
+; GFX7LESS-ITERATIVE-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-ITERATIVE-NEXT:    s_cbranch_execnz .LBB11_4
+; GFX7LESS-ITERATIVE-NEXT:  .LBB11_5:
+; GFX7LESS-ITERATIVE-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_mov_b32 s50, -1
-; GFX9-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-NEXT:    s_mov_b32 s33, s8
-; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_mov_b32 s40, s7
-; GFX9-NEXT:    s_mov_b32 s41, s6
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
+; GFX9-NEXT:    s_add_u32 s8, s34, 44
+; GFX9-NEXT:    s_addc_u32 s9, s35, 0
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s41
-; GFX9-NEXT:    s_mov_b32 s13, s40
-; GFX9-NEXT:    s_mov_b32 s14, s33
-; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX9-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b64 s4, s[0:1]
 ; GFX9-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX9-NEXT:    v_readlane_b32 s2, v0, s4
-; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX9-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX9-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX9-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8375,103 +6064,70 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_load_dwordx2 s[42:43], s[36:37], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX9-NEXT:    s_mov_b64 s[44:45], 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v0, s[42:43]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
 ; GFX9-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX9-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    buffer_store_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-NEXT:    s_mov_b32 s12, s41
-; GFX9-NEXT:    s_mov_b32 s13, s40
-; GFX9-NEXT:    s_mov_b32 s14, s33
-; GFX9-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s42
-; GFX9-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX9-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX9-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s43
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX9-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX9-NEXT:  .LBB11_5:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-NEXT:    s_mov_b32 s50, -1
-; GFX1064-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s48, s48, s9
+; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT:    s_mov_b32 s38, -1
+; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1064-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-NEXT:    s_mov_b32 s33, s8
+; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1064-NEXT:    s_mov_b32 s40, s7
-; GFX1064-NEXT:    s_mov_b32 s41, s6
-; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-NEXT:    s_mov_b32 s12, s41
-; GFX1064-NEXT:    s_mov_b32 s13, s40
-; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    s_movk_i32 s32, 0x800
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
+; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b64 s4, s[0:1]
-; GFX1064-NEXT:    v_max_f64 v[2:3], v[3:4], v[3:4]
+; GFX1064-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX1064-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1064-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1064-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1064-NEXT:    s_lshl_b64 s[2:3], 1, s4
 ; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    v_min_f64 v[3:4], v[2:3], v[4:5]
+; GFX1064-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8479,92 +6135,43 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT:    s_cbranch_execz .LBB11_5
+; GFX1064-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1064-NEXT:  ; %bb.3:
-; GFX1064-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1064-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    global_load_dwordx2 v[1:2], v0, s[42:43]
-; GFX1064-NEXT:  .LBB11_4: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-NEXT:    s_mov_b32 s12, s41
-; GFX1064-NEXT:    s_mov_b32 s13, s40
-; GFX1064-NEXT:    s_mov_b32 s14, s33
-; GFX1064-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
-; GFX1064-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX1064-NEXT:  .LBB11_5:
+; GFX1064-NEXT:    global_atomic_fmin_x2 v0, v[2:3], s[0:1]
+; GFX1064-NEXT:  .LBB11_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-NEXT:    s_mov_b32 s50, -1
-; GFX1032-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s48, s48, s9
+; GFX1032-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT:    s_mov_b32 s38, -1
+; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1032-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-NEXT:    s_mov_b32 s33, s8
+; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1032-NEXT:    s_mov_b32 s40, s7
-; GFX1032-NEXT:    s_mov_b32 s41, s6
-; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-NEXT:    s_mov_b32 s12, s41
-; GFX1032-NEXT:    s_mov_b32 s13, s40
-; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
+; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1032-NEXT:    s_mov_b32 s0, exec_lo
@@ -8582,93 +6189,44 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s44, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT:    s_cbranch_execz .LBB11_5
+; GFX1032-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1032-NEXT:  ; %bb.3:
-; GFX1032-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
+; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    global_load_dwordx2 v[4:5], v0, s[42:43]
-; GFX1032-NEXT:  .LBB11_4: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-NEXT:    buffer_store_dword v5, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    buffer_store_dword v4, off, s[48:51], 0
-; GFX1032-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-NEXT:    s_mov_b32 s12, s41
-; GFX1032-NEXT:    s_mov_b32 s13, s40
-; GFX1032-NEXT:    s_mov_b32 s14, s33
-; GFX1032-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1032-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX1032-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX1032-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    buffer_load_dword v4, off, s[48:51], 0
-; GFX1032-NEXT:    buffer_load_dword v5, off, s[48:51], 0 offset:4
-; GFX1032-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX1032-NEXT:  .LBB11_5:
+; GFX1032-NEXT:    global_atomic_fmin_x2 v0, v[2:3], s[0:1]
+; GFX1032-NEXT:  .LBB11_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1164:       ; %bb.0:
 ; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1164-NEXT:    s_mov_b32 s33, s8
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-NEXT:    s_mov_b32 s12, s6
 ; GFX1164-NEXT:    s_mov_b32 s13, s7
-; GFX1164-NEXT:    s_mov_b32 s14, s33
-; GFX1164-NEXT:    s_mov_b32 s32, 32
-; GFX1164-NEXT:    v_mov_b32_e32 v40, v0
-; GFX1164-NEXT:    s_mov_b32 s40, s7
-; GFX1164-NEXT:    s_mov_b32 s41, s6
+; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_ctz_i32_b64 s4, s[0:1]
-; GFX1164-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX1164-NEXT:    v_readlane_b32 s3, v1, s4
 ; GFX1164-NEXT:    v_readlane_b32 s2, v0, s4
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
@@ -8677,7 +6235,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[2:3]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1164-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8688,87 +6246,56 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX1164-NEXT:  ; %bb.3:
-; GFX1164-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
-; GFX1164-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
+; GFX1164-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    global_load_b64 v[4:5], v0, s[42:43]
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT:    .p2align 6
+; GFX1164-NEXT:    global_load_b64 v[2:3], v6, s[0:1]
 ; GFX1164-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-NEXT:    s_mov_b32 s12, s41
-; GFX1164-NEXT:    s_mov_b32 s13, s40
-; GFX1164-NEXT:    s_mov_b32 s14, s33
-; GFX1164-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    scratch_store_b64 off, v[4:5], off
-; GFX1164-NEXT:    scratch_store_b64 off, v[0:1], off offset:8
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-NEXT:    scratch_load_b64 v[4:5], off, off
-; GFX1164-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX1164-NEXT:  .LBB11_5:
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1132-NEXT:    s_mov_b64 s[38:39], s[0:1]
+; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1132-NEXT:    s_mov_b32 s40, s14
-; GFX1132-NEXT:    s_mov_b32 s41, s13
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
-; GFX1132-NEXT:    s_mov_b32 s32, 32
-; GFX1132-NEXT:    s_mov_b32 s33, s15
-; GFX1132-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1132-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB11_1: ; %ComputeLoop
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1132-NEXT:    s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_max_f64 v[2:3], v[4:5], v[4:5]
 ; GFX1132-NEXT:    v_readlane_b32 s3, v1, s1
 ; GFX1132-NEXT:    v_readlane_b32 s2, v0, s1
 ; GFX1132-NEXT:    s_lshl_b32 s1, 1, s1
@@ -8777,837 +6304,592 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX1132-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
 ; GFX1132-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX1132-NEXT:    v_min_f64 v[4:5], v[2:3], v[4:5]
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB11_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s44, 0
+; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB11_5
 ; GFX1132-NEXT:  ; %bb.3:
-; GFX1132-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-NEXT:    v_max_f64 v[41:42], v[2:3], v[2:3]
+; GFX1132-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    global_load_b64 v[4:5], v0, s[42:43]
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT:    .p2align 6
+; GFX1132-NEXT:    global_load_b64 v[2:3], v6, s[0:1]
 ; GFX1132-NEXT:  .LBB11_4: ; %atomicrmw.start
 ; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_max_f64 v[0:1], v[4:5], v[4:5]
-; GFX1132-NEXT:    s_add_u32 s8, s34, 44
-; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1132-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-NEXT:    s_mov_b32 s12, s41
-; GFX1132-NEXT:    s_mov_b32 s13, s40
-; GFX1132-NEXT:    s_mov_b32 s14, s33
-; GFX1132-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1132-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1132-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    scratch_store_b64 off, v[4:5], off
-; GFX1132-NEXT:    scratch_store_b64 off, v[0:1], off offset:8
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-NEXT:    scratch_load_b64 v[4:5], off, off
-; GFX1132-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-NEXT:    global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
 ; GFX1132-NEXT:    s_cbranch_execnz .LBB11_4
 ; GFX1132-NEXT:  .LBB11_5:
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-NEXT:    s_endpgm
 ;
 ; GFX7LESS-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    s_movk_i32 s32, 0x800
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s51, 0xe8f000
-; GFX7LESS-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[44:45], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s47, 0xf000
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s46, -1
-; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s42, -1
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s43, 0xe8f000
+; GFX7LESS-DPP-NEXT:    s_add_u32 s40, s40, s9
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s41, s41, 0
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s39, 0xf000
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX7LESS-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v40, v0, v2
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-DPP-NEXT:    v_or_b32_e32 v31, v0, v2
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[44:47], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[42:43], 0
-; GFX7LESS-DPP-NEXT:    v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-DPP-NEXT:    buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], 0
+; GFX7LESS-DPP-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
 ; GFX7LESS-DPP-NEXT:  .LBB11_1: ; %atomicrmw.start
 ; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-DPP-NEXT:    v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX7LESS-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], v[41:42]
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX7LESS-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX7LESS-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX7LESS-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0 offset:12
-; GFX7LESS-DPP-NEXT:    buffer_store_dword v0, off, s[48:51], 0 offset:8
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-DPP-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
 ; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, s45
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX7LESS-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0
-; GFX7LESS-DPP-NEXT:    buffer_load_dword v3, off, s[48:51], 0 offset:4
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[42:43], vcc, s[42:43]
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v9, v3
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v8, v2
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v7, v1
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v6, v0
+; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX7LESS-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-DPP-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v6
+; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v7
+; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB11_1
 ; GFX7LESS-DPP-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7LESS-DPP-NEXT:    s_endpgm
 ;
 ; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX9-DPP:       ; %bb.0:
-; GFX9-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX9-DPP-NEXT:    s_mov_b32 s51, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s48, s48, s9
-; GFX9-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[36:37], s[2:3]
-; GFX9-DPP-NEXT:    s_mov_b32 s33, s8
-; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX9-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
+; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX9-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-DPP-NEXT:    s_nop 0
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX9-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
 ; GFX9-DPP-NEXT:    s_nop 1
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX9-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT:    v_readlane_b32 s43, v9, 63
-; GFX9-DPP-NEXT:    v_readlane_b32 s42, v8, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s3, v4, 63
+; GFX9-DPP-NEXT:    v_readlane_b32 s2, v3, 63
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
 ; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX9-DPP-NEXT:  ; %bb.1:
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[44:45], s[36:37], 0x24
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[46:47], 0
+; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[44:45]
+; GFX9-DPP-NEXT:    global_load_dwordx2 v[11:12], v0, s[2:3]
 ; GFX9-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX9-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT:    v_max_f64 v[3:4], s[42:43], s[42:43]
+; GFX9-DPP-NEXT:    v_max_f64 v[1:2], s[0:1], s[0:1]
 ; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT:    v_max_f64 v[5:6], v[1:2], v[1:2]
-; GFX9-DPP-NEXT:    s_add_u32 s8, s36, 44
-; GFX9-DPP-NEXT:    s_addc_u32 s9, s37, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX9-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX9-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX9-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX9-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX9-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v2, s44
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, s45
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX9-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX9-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX9-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-DPP-NEXT:    s_or_b64 s[46:47], vcc, s[46:47]
-; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[46:47]
+; GFX9-DPP-NEXT:    v_max_f64 v[9:10], v[11:12], v[11:12]
+; GFX9-DPP-NEXT:    v_min_f64 v[9:10], v[9:10], v[1:2]
+; GFX9-DPP-NEXT:    global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
+; GFX9-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v12, v2
+; GFX9-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX9-DPP-NEXT:  .LBB11_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
-; GFX1064-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1064-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1064-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1064-DPP-NEXT:    s_mov_b32 s51, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s48, s48, s9
+; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1064-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s33, s8
+; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1064-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1064-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1064-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1064-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1064-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1064-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1064-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1064-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
-; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v9, 0
-; GFX1064-DPP-NEXT:    v_readlane_b32 s5, v9, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s4, v8, 32
-; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v8, 0
-; GFX1064-DPP-NEXT:    v_max_f64 v[8:9], s[4:5], s[4:5]
-; GFX1064-DPP-NEXT:    v_max_f64 v[10:11], s[2:3], s[2:3]
-; GFX1064-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1064-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1064-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[5:6]
+; GFX1064-DPP-NEXT:    v_readlane_b32 s3, v4, 0
+; GFX1064-DPP-NEXT:    v_readlane_b32 s5, v4, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s4, v3, 32
+; GFX1064-DPP-NEXT:    v_readlane_b32 s2, v3, 0
+; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], s[4:5], s[4:5]
+; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], s[2:3], s[2:3]
+; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
 ; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v41, v8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v42, v9
-; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB11_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[44:45], 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[42:43]
-; GFX1064-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f64 v[3:4], v[41:42], v[41:42]
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_max_f64 v[5:6], v[1:2], v[1:2]
-; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1064-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1064-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1064-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1064-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1064-DPP-NEXT:    s_clause 0x1
-; GFX1064-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1064-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1064-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[44:45]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB11_2
-; GFX1064-DPP-NEXT:  .LBB11_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT:  .LBB11_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1032-DPP:       ; %bb.0:
-; GFX1032-DPP-NEXT:    s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT:    s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX1032-DPP-NEXT:    s_mov_b32 s50, -1
-; GFX1032-DPP-NEXT:    s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s48, s48, s9
+; GFX1032-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1032-DPP-NEXT:    s_addc_u32 s49, s49, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s33, s8
+; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1032-DPP-NEXT:    s_mov_b32 s41, s6
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    v_or3_b32 v40, v0, v1, v2
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
 ; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1032-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1032-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1032-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v13, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1032-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1032-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1032-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1032-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
-; GFX1032-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1032-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v8, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[7:8], v[7:8], v[7:8]
+; GFX1032-DPP-NEXT:    v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[3:4], v[3:4]
+; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v6, v6, -1, -1
+; GFX1032-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1032-DPP-NEXT:    v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[5:6]
 ; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1032-DPP-NEXT:    s_mov_b32 s44, 0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v4
+; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB11_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[42:43], s[34:35], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    global_load_dwordx2 v[1:2], v0, s[42:43]
-; GFX1032-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1032-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1032-DPP-NEXT:    buffer_store_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    buffer_store_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[48:49]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[50:51]
-; GFX1032-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
-; GFX1032-DPP-NEXT:    buffer_store_dword v4, off, s[48:51], 0 offset:12
-; GFX1032-DPP-NEXT:    buffer_store_dword v3, off, s[48:51], 0 offset:8
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GFX1032-DPP-NEXT:    s_clause 0x1
-; GFX1032-DPP-NEXT:    buffer_load_dword v1, off, s[48:51], 0
-; GFX1032-DPP-NEXT:    buffer_load_dword v2, off, s[48:51], 0 offset:4
-; GFX1032-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s44
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB11_2
-; GFX1032-DPP-NEXT:  .LBB11_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT:  .LBB11_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1164-DPP-NEXT:    s_mov_b32 s33, s8
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
 ; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT:    s_mov_b32 s32, 32
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v40, v0
-; GFX1164-DPP-NEXT:    s_mov_b32 s40, s7
-; GFX1164-DPP-NEXT:    s_mov_b32 s41, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1164-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1164-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1164-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v13, v9
+; GFX1164-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1164-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
+; GFX1164-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1164-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1164-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX1164-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
-; GFX1164-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
+; GFX1164-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1164-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v11, v9
-; GFX1164-DPP-NEXT:    v_permlane64_b32 v10, v8
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v5, v3
+; GFX1164-DPP-NEXT:    v_permlane64_b32 v4, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1164-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
+; GFX1164-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1164-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1164-DPP-NEXT:  ; %bb.1:
-; GFX1164-DPP-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1164-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    global_load_b64 v[1:2], v0, s[42:43]
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-DPP-NEXT:    .p2align 6
+; GFX1164-DPP-NEXT:    global_load_b64 v[10:11], v12, s[0:1]
 ; GFX1164-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1164-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v40
-; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v5, 8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v6, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1164-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
-; GFX1164-DPP-NEXT:    s_clause 0x1
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1164-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, s42
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, s43
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1164-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1164-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[44:45], vcc, s[44:45]
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v10, v8
+; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
 ; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX1164-DPP-NEXT:  .LBB11_3:
-; GFX1164-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[38:39], s[0:1]
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, div.double.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, div.double.value at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GFX1132-DPP-NEXT:    s_mov_b32 s40, s14
-; GFX1132-DPP-NEXT:    s_mov_b32 s41, s13
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.double.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.double.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
-; GFX1132-DPP-NEXT:    s_mov_b32 s32, 32
-; GFX1132-DPP-NEXT:    s_mov_b32 s33, s15
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v40, v0
+; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v9, 0x7ff80000
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v11, v1
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v11, v9
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v2
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1132-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1132-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX1132-DPP-NEXT:    v_min_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX1132-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v11, v11, -1, -1
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v5, v5, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v10, v10, -1, -1
-; GFX1132-DPP-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v4, v4, -1, -1
+; GFX1132-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[10:11]
+; GFX1132-DPP-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, v8
-; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v9
-; GFX1132-DPP-NEXT:    s_mov_b32 s44, 0
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v3
+; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v8
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
-; GFX1132-DPP-NEXT:    s_load_b64 s[42:43], s[34:35], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT:    v_max_f64 v[41:42], v[3:4], v[3:4]
+; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1132-DPP-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    global_load_b64 v[1:2], v0, s[42:43]
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-DPP-NEXT:    .p2align 6
+; GFX1132-DPP-NEXT:    global_load_b64 v[10:11], v12, s[0:1]
 ; GFX1132-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
 ; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_max_f64 v[3:4], v[1:2], v[1:2]
-; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[0:1]
-; GFX1132-DPP-NEXT:    s_add_u32 s0, s0, __atomic_compare_exchange at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s1, s1, __atomic_compare_exchange at gotpcrel32@hi+12
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
-; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v7, 0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[38:39]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; GFX1132-DPP-NEXT:    s_mov_b32 s12, s41
-; GFX1132-DPP-NEXT:    s_mov_b32 s13, s40
-; GFX1132-DPP-NEXT:    s_mov_b32 s14, s33
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX1132-DPP-NEXT:    v_min_f64 v[3:4], v[3:4], v[41:42]
-; GFX1132-DPP-NEXT:    s_clause 0x1
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[1:2], off
-; GFX1132-DPP-NEXT:    scratch_store_b64 off, v[3:4], off offset:8
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
-; GFX1132-DPP-NEXT:    v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX1132-DPP-NEXT:    scratch_load_b64 v[1:2], off, off
-; GFX1132-DPP-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s44, vcc_lo, s44
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_min_f64 v[8:9], v[8:9], v[0:1]
+; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
+; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1132-DPP-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
 ; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB11_2
 ; GFX1132-DPP-NEXT:  .LBB11_3:
-; GFX1132-DPP-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX1132-DPP-NEXT:    s_endpgm
   %divValue = call double @div.double.value()
-  %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 4
+  %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 8
   ret void
 }
 
@@ -9680,55 +6962,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1064-NEXT:  .LBB12_3:
+; GFX1064-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB12_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1032-NEXT:  .LBB12_3:
+; GFX1032-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB12_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9738,95 +6994,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1164-NEXT:  .LBB12_3:
+; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB12_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1132-NEXT:  .LBB12_3:
+; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB12_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB12_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX7LESS-DPP-NEXT:  .LBB12_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9862,55 +7058,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1064-DPP-NEXT:  .LBB12_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB12_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1032-DPP-NEXT:  .LBB12_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB12_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -9920,60 +7090,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1164-DPP-NEXT:  .LBB12_3:
+; GFX1164-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB12_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX1132-DPP-NEXT:  .LBB12_3:
+; GFX1132-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB12_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1
   ret void
@@ -10048,55 +7191,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1064-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1064-NEXT:  .LBB13_3:
+; GFX1064-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT:  .LBB13_2:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032:       ; %bb.0:
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1032-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1032-NEXT:  .LBB13_3:
+; GFX1032-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT:  .LBB13_2:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -10106,95 +7223,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1164-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1164-NEXT:  ; %bb.1:
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1164-NEXT:  .LBB13_3:
+; GFX1164-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT:  .LBB13_2:
+; GFX1164-NEXT:    s_nop 0
+; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132:       ; %bb.0:
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1132-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1132-NEXT:  ; %bb.1:
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1132-NEXT:  .LBB13_3:
+; GFX1132-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT:  .LBB13_2:
+; GFX1132-NEXT:    s_nop 0
+; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
 ;
-; GFX7LESS-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
-; GFX7LESS-DPP:       ; %bb.0:
-; GFX7LESS-DPP-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-DPP-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX7LESS-DPP-NEXT:    s_cbranch_execz .LBB13_3
-; GFX7LESS-DPP-NEXT:  ; %bb.1:
-; GFX7LESS-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-DPP-NEXT:    s_mov_b64 s[4:5], 0
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7LESS-DPP-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX7LESS-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-DPP-NEXT:    v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-DPP-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v3, v1
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX7LESS-DPP-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
-; GFX7LESS-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX7LESS-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7LESS-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-DPP-NEXT:    v_mov_b32_e32 v1, v2
-; GFX7LESS-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX7LESS-DPP-NEXT:  .LBB13_3:
-; GFX7LESS-DPP-NEXT:    s_endpgm
-;
 ; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX9-DPP:       ; %bb.0:
 ; GFX9-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10230,55 +7287,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1064-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1064-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1064-DPP-NEXT:  ; %bb.1:
 ; GFX1064-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1064-DPP-NEXT:  .LBB13_3:
+; GFX1064-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-DPP-NEXT:  .LBB13_2:
 ; GFX1064-DPP-NEXT:    s_endpgm
 ;
 ; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1032-DPP:       ; %bb.0:
 ; GFX1032-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-DPP-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1032-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1032-DPP-NEXT:  ; %bb.1:
 ; GFX1032-DPP-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1032-DPP-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
-; GFX1032-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1032-DPP-NEXT:  .LBB13_3:
+; GFX1032-DPP-NEXT:    global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-DPP-NEXT:  .LBB13_2:
 ; GFX1032-DPP-NEXT:    s_endpgm
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -10288,60 +7319,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1164-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1164-DPP-NEXT:  ; %bb.1:
 ; GFX1164-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1164-DPP-NEXT:  .LBB13_3:
+; GFX1164-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT:  .LBB13_2:
+; GFX1164-DPP-NEXT:    s_nop 0
+; GFX1164-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-DPP-NEXT:    s_endpgm
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1132-DPP:       ; %bb.0:
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
 ; GFX1132-DPP-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX1132-DPP-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT:    global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX1132-DPP-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1132-DPP-NEXT:  .LBB13_3:
+; GFX1132-DPP-NEXT:    global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT:  .LBB13_2:
+; GFX1132-DPP-NEXT:    s_nop 0
+; GFX1132-DPP-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-DPP-NEXT:    s_endpgm
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0  monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
   ret void
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
index 7290a91c9ccd2..b0d2824a64ee3 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
@@ -3345,176 +3345,936 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
 ;---------------------------------------------------------------------
 
 define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP6]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP6]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP6]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP6]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
@@ -3525,176 +4285,936 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
 ;---------------------------------------------------------------------
 
 define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP6]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP6]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX12-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
-  ret float %res
-}
-
-define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
-  ret float %res
-}
-
-define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[TMP6]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[TMP6]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
index 05fb224e6f145..c5bf26dc4c0e1 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
@@ -3367,66 +3367,339 @@ define float @test_atomicrmw_fmax_f32_global_system(ptr addrspace(1) %ptr, float
 }
 
 define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
@@ -3455,66 +3728,339 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
 }
 
 define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
@@ -3542,71 +4088,344 @@ define float @test_atomicrmw_fmin_f32_global_system(ptr addrspace(1) %ptr, float
 ; COMMON:       atomicrmw.end:
 ; COMMON-NEXT:    ret float [[TMP6]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst
-  ret float %res
-}
-
-define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
+;
+  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
+  ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
 ;
-  %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
-  ret float %res
-}
-
-define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret float %res
@@ -3635,66 +4454,339 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
 }
 
 define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
 }
 
 define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX940:       atomicrmw.start:
+; GFX940-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT:    [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT:    [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940:       atomicrmw.end:
+; GFX940-NEXT:    ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT:    ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT:    ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT:    ret float [[RES]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
   ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
index af6b7e0addfb1..ee360dee79425 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
@@ -1241,88 +1241,468 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode
 ;---------------------------------------------------------------------
 
 define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[TMP6]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[TMP6]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
   ret double %res
 }
 
 define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
@@ -1457,88 +1837,468 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
 ;---------------------------------------------------------------------
 
 define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[TMP6]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[TMP6]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
   ret double %res
 }
 
 define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
index 69d65e6f1f379..ac5dd55002f3f 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
@@ -1263,66 +1263,339 @@ define double @test_atomicrmw_fmax_f64_global_system(ptr addrspace(1) %ptr, doub
 }
 
 define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res
@@ -1479,66 +1752,339 @@ define double @test_atomicrmw_fmin_f64_global_system(ptr addrspace(1) %ptr, doub
 }
 
 define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX90A:       atomicrmw.start:
+; GFX90A-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A:       atomicrmw.end:
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX10:       atomicrmw.start:
+; GFX10-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT:    [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10:       atomicrmw.end:
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
   ret double %res
 }
 
 define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; COMMON:       atomicrmw.start:
-; COMMON-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON:       atomicrmw.end:
-; COMMON-NEXT:    ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX803:       atomicrmw.start:
+; GFX803-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803:       atomicrmw.end:
+; GFX803-NEXT:    ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX906:       atomicrmw.start:
+; GFX906-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906:       atomicrmw.end:
+; GFX906-NEXT:    ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX908:       atomicrmw.start:
+; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908:       atomicrmw.end:
+; GFX908-NEXT:    ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT:    ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT:    ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT:    [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT:    ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX11:       atomicrmw.start:
+; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11:       atomicrmw.end:
+; GFX11-NEXT:    ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT:    [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GFX12:       atomicrmw.start:
+; GFX12-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT:    [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT:    [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12:       atomicrmw.end:
+; GFX12-NEXT:    ret double [[TMP6]]
 ;
   %res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
   ret double %res



More information about the cfe-commits mailing list