[llvm-branch-commits] [llvm] AMDGPU: Handle remote/fine-grained memory in atomicrmw fmin/fmax lowering (PR #96759)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jun 26 05:40:03 PDT 2024
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/96759
Consider the new atomic metadata when choosing to expand as cmpxchg
instead.
>From 50d27a497441d45ae927e1ae2d77e560bc103365 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sun, 23 Jun 2024 11:16:22 +0200
Subject: [PATCH] AMDGPU: Handle remote/fine-grained memory in atomicrmw
fmin/fmax lowering
Consider the new atomic metadata when choosing to expand as cmpxchg
instead.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 83 +-
.../buffer-fat-pointer-atomicrmw-fmax.ll | 333 ++-
.../buffer-fat-pointer-atomicrmw-fmin.ll | 333 ++-
.../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 446 +--
.../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 446 +--
.../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 579 ++--
.../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 579 ++--
.../AMDGPU/global_atomics_scan_fmax.ll | 2400 +++++------------
.../AMDGPU/global_atomics_scan_fmin.ll | 2400 +++++------------
.../AMDGPU/expand-atomic-f32-agent.ll | 2052 ++++++++++++--
.../AMDGPU/expand-atomic-f32-system.ll | 1496 ++++++++--
.../AMDGPU/expand-atomic-f64-agent.ll | 1016 ++++++-
.../AMDGPU/expand-atomic-f64-system.ll | 738 ++++-
13 files changed, 7015 insertions(+), 5886 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fc34277c580a8..11ebfe7511f7b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16093,6 +16093,34 @@ static bool isBFloat2(Type *Ty) {
return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
}
+/// \returns true if it's valid to emit a native instruction for \p RMW, based
+/// on the properties of the target memory.
+static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
+ const AtomicRMWInst *RMW,
+ bool HasSystemScope) {
+ // The remote/fine-grained access logic is different from the integer
+ // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
+ // fine-grained access does not work, even for a device local allocation.
+ //
+ // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
+ // allocations work.
+ if (HasSystemScope) {
+ if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
+ RMW->hasMetadata("amdgpu.no.remote.memory"))
+ return true;
+ } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
+ return true;
+
+ if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
+ return true;
+
+ // TODO: Auto-upgrade this attribute to the metadata in function body and stop
+ // checking it.
+ return RMW->getFunction()
+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+ .getValueAsBool();
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
@@ -16236,37 +16264,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
Type *Ty = RMW->getType();
// LDS float and double fmin/fmax were always supported.
- if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
- return AtomicExpansionKind::None;
-
- if (unsafeFPAtomicsDisabled(RMW->getFunction()))
- return AtomicExpansionKind::CmpXChg;
-
- // Always expand system scope fp atomics.
- if (HasSystemScope)
- return AtomicExpansionKind::CmpXChg;
+ if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+ return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
+ : AtomicExpansionKind::CmpXChg;
+ }
- // For flat and global cases:
- // float, double in gfx7. Manual claims denormal support.
- // Removed in gfx8.
- // float, double restored in gfx10.
- // double removed again in gfx11, so only f32 for gfx11/gfx12.
- //
- // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
- // f32.
- //
- // FIXME: Check scope and fine grained memory
- if (AS == AMDGPUAS::FLAT_ADDRESS) {
- if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
- if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
- } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
- AS == AMDGPUAS::BUFFER_FAT_POINTER) {
- if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
- if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
+ // For flat and global cases:
+ // float, double in gfx7. Manual claims denormal support.
+ // Removed in gfx8.
+ // float, double restored in gfx10.
+ // double removed again in gfx11, so only f32 for gfx11/gfx12.
+ //
+ // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
+ // no f32.
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
+ AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+ if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ }
}
return AtomicExpansionKind::CmpXChg;
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 06dee9c279f2c..2a15cdaede44e 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -796,23 +796,64 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
+; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX11-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB3_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s8
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
+; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc
+; GFX10-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB3_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
@@ -904,19 +945,60 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, s8
-; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc
+; GFX7-NEXT: v_mov_b32_e32 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX7-NEXT: v_mov_b32_e32 v3, s10
+; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX7-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB3_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v1, s8
-; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc
+; GFX6-NEXT: v_mov_b32_e32 v1, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX6-NEXT: v_mov_b32_e32 v3, s10
+; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX6-NEXT: v_max_f32_e32 v4, v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB3_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
@@ -1992,21 +2074,66 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s8
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: s_addk_i32 s8, 0x800
+; GFX10-NEXT: v_mov_b32_e32 v6, s8
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
+; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX10-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
+; GFX10-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX90A-NEXT: s_add_i32 s10, s8, 0x800
+; GFX90A-NEXT: s_mov_b64 s[8:9], 0
+; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT: v_mov_b32_e32 v6, s10
+; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
@@ -2078,19 +2205,68 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
-; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX7-NEXT: s_add_i32 s10, s8, 0x800
+; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s10
+; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v10, v1
+; GFX7-NEXT: v_mov_b32_e32 v9, v0
+; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX7-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
+; GFX7-NEXT: v_mov_b32_e32 v1, v8
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
+; GFX7-NEXT: v_mov_b32_e32 v3, v10
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: v_mov_b32_e32 v3, v1
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX6-NEXT: s_add_i32 s10, s8, 0x800
+; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_mov_b32_e32 v6, s10
+; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v10, v1
+; GFX6-NEXT: v_mov_b32_e32 v9, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX6-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v0, v7
+; GFX6-NEXT: v_mov_b32_e32 v1, v8
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
+; GFX6-NEXT: v_mov_b32_e32 v3, v10
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
@@ -7943,32 +8119,11 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: s_addk_co_i32 s4, 0x400
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v5, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5
-; GFX12-NEXT: v_max_num_f32_e32 v4, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: s_cbranch_execnz .LBB22_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
@@ -8003,64 +8158,23 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
-; GFX11-NEXT: s_addk_i32 s4, 0x400
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX11-NEXT: v_max_f32_e32 v4, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_execnz .LBB22_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: s_addk_i32 s8, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s8
-; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX10-NEXT: s_mov_b32 s8, 0
-; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, s8
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX10-NEXT: v_max_f32_e32 v4, v0, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT: s_cbranch_execnz .LBB22_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
@@ -8154,60 +8268,19 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, v0
-; GFX7-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s10, s8, 0x400
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, s10
-; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v4, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB22_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v1, v0
-; GFX6-NEXT: v_mov_b32_e32 v0, s8
-; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX6-NEXT: s_add_i32 s10, s8, 0x400
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, s10
-; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5
-; GFX6-NEXT: v_max_f32_e32 v4, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v0, v4
-; GFX6-NEXT: v_mov_b32_e32 v1, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB22_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
@@ -8215,6 +8288,6 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_
ret float %result
}
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 2791162396a91..3f6118b030f04 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -796,23 +796,64 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
+; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX11-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB3_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s8
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
+; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
+; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc
+; GFX10-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX10-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB3_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
@@ -904,19 +945,60 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, s8
-; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc
+; GFX7-NEXT: v_mov_b32_e32 v1, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX7-NEXT: v_mov_b32_e32 v3, s10
+; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX7-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB3_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v1, s8
-; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc
+; GFX6-NEXT: v_mov_b32_e32 v1, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX6-NEXT: v_mov_b32_e32 v3, s10
+; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX6-NEXT: v_min_f32_e32 v4, v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB3_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
@@ -1992,21 +2074,66 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s8
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: s_addk_i32 s8, 0x800
+; GFX10-NEXT: v_mov_b32_e32 v6, s8
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
+; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX10-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
+; GFX10-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX90A-NEXT: s_add_i32 s10, s8, 0x800
+; GFX90A-NEXT: s_mov_b64 s[8:9], 0
+; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX90A-NEXT: v_mov_b32_e32 v6, s10
+; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX90A-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
@@ -2078,19 +2205,68 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
-; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX7-NEXT: s_add_i32 s10, s8, 0x800
+; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s10
+; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v10, v1
+; GFX7-NEXT: v_mov_b32_e32 v9, v0
+; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX7-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
+; GFX7-NEXT: v_mov_b32_e32 v1, v8
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
+; GFX7-NEXT: v_mov_b32_e32 v3, v10
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: v_mov_b32_e32 v3, v1
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX6-NEXT: s_add_i32 s10, s8, 0x800
+; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_mov_b32_e32 v6, s10
+; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v10, v1
+; GFX6-NEXT: v_mov_b32_e32 v9, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
+; GFX6-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v0, v7
+; GFX6-NEXT: v_mov_b32_e32 v1, v8
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
+; GFX6-NEXT: v_mov_b32_e32 v3, v10
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
@@ -7943,32 +8119,11 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: s_addk_co_i32 s4, 0x400
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v5, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5
-; GFX12-NEXT: v_min_num_f32_e32 v4, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: s_cbranch_execnz .LBB22_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
@@ -8003,64 +8158,23 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
-; GFX11-NEXT: s_addk_i32 s4, 0x400
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX11-NEXT: v_min_f32_e32 v4, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_execnz .LBB22_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: s_addk_i32 s8, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s8
-; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX10-NEXT: s_mov_b32 s8, 0
-; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, s8
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX10-NEXT: v_min_f32_e32 v4, v0, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT: s_cbranch_execnz .LBB22_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
@@ -8154,60 +8268,19 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, v0
-; GFX7-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s10, s8, 0x400
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, s10
-; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v4, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB22_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v1, v0
-; GFX6-NEXT: v_mov_b32_e32 v0, s8
-; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX6-NEXT: s_add_i32 s10, s8, 0x400
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, s10
-; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5
-; GFX6-NEXT: v_min_f32_e32 v4, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v0, v4
-; GFX6-NEXT: v_mov_b32_e32 v1, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen offset:1024 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB22_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
@@ -8215,6 +8288,6 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_
ret float %result
}
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index e6d6652c8b7b7..85567ccdaddc8 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -923,28 +923,10 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB6_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -975,56 +957,23 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB6_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB6_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1105,26 +1054,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX7-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB6_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1139,27 +1073,10 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB7_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1189,28 +1106,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX11-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB7_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1218,26 +1119,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB7_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1318,24 +1205,9 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB7_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1384,21 +1256,55 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
@@ -1476,9 +1382,25 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret float %result
@@ -2541,28 +2463,10 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB16_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2593,56 +2497,23 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB16_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_max_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2723,26 +2594,11 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX7-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_max_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2757,27 +2613,10 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB17_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2807,28 +2646,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX11-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB17_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2836,26 +2659,12 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmax v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2936,24 +2745,9 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmax v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -3991,19 +3785,54 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB24_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
@@ -4064,9 +3893,30 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[5:6]
+; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB24_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret double %result
@@ -15149,7 +14999,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
ret void
}
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 6babcfd15ee1a..b41c4991be429 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -923,28 +923,10 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB6_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -975,56 +957,23 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB6_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB6_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1105,26 +1054,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB6_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1139,27 +1073,10 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB7_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1189,28 +1106,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB7_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1218,26 +1119,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB7_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1318,24 +1205,9 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB7_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -1384,21 +1256,55 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
@@ -1476,9 +1382,25 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret float %result
@@ -2541,28 +2463,10 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB16_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2593,56 +2497,23 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB16_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v6, v6
-; GFX10-NEXT: v_min_f32_e32 v5, v0, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2723,26 +2594,11 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
-; GFX7-NEXT: v_min_f32_e32 v5, v0, v1
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2757,27 +2613,10 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB17_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2807,28 +2646,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB17_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2836,26 +2659,12 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2936,24 +2745,9 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
%unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -3991,19 +3785,54 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB24_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
@@ -4064,9 +3893,30 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[5:6]
+; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB24_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret double %result
@@ -15149,7 +14999,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
ret void
}
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index e7d62fdc00cff..89e3084e98b14 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -979,28 +979,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB6_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1031,55 +1013,21 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB6_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB6_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1164,27 +1112,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB6_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1194,28 +1125,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB6_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -1231,27 +1144,10 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB7_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1281,53 +1177,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB7_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB7_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1410,26 +1274,9 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB7_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1439,27 +1286,9 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB7_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -1509,21 +1338,55 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
@@ -1605,10 +1468,27 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
+; GFX7-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
@@ -1618,10 +1498,28 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
+; GFX6-NEXT: v_max_f32_e32 v4, v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -2759,28 +2657,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB16_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2811,55 +2691,21 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_max_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB16_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2944,27 +2790,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2974,28 +2803,10 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_max_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB16_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -3011,27 +2822,10 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB17_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -3061,53 +2855,21 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX11-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB17_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off offset:2044
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -3190,26 +2952,9 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -3219,27 +2964,9 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB17_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -4327,19 +4054,54 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB24_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
@@ -4398,28 +4160,69 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v11, v1
+; GFX7-NEXT: v_mov_b32_e32 v10, v0
+; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v0, v8
+; GFX7-NEXT: v_mov_b32_e32 v1, v9
+; GFX7-NEXT: v_mov_b32_e32 v2, v10
+; GFX7-NEXT: v_mov_b32_e32 v3, v11
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_mov_b32_e32 v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB24_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v11, v1
+; GFX6-NEXT: v_mov_b32_e32 v10, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v0, v8
+; GFX6-NEXT: v_mov_b32_e32 v1, v9
+; GFX6-NEXT: v_mov_b32_e32 v2, v10
+; GFX6-NEXT: v_mov_b32_e32 v3, v11
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_mov_b32_e32 v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB24_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -17279,7 +17082,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi
ret void
}
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index 91a8ac7c935b6..26d132f194081 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -979,28 +979,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB6_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1031,55 +1013,21 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB6_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB6_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1164,27 +1112,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB6_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1194,28 +1125,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB6_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -1231,27 +1144,10 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB7_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1281,53 +1177,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB7_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB7_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1410,26 +1274,9 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB7_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1439,27 +1286,9 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB7_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -1509,21 +1338,55 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
@@ -1605,10 +1468,27 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
+; GFX7-NEXT: v_min_f32_e32 v4, v3, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
@@ -1618,10 +1498,28 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
+; GFX6-NEXT: v_min_f32_e32 v4, v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -2759,28 +2657,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
-; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB16_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2811,55 +2691,21 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX11-NEXT: v_min_f32_e32 v3, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB16_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_max_f32_e32 v3, v4, v4
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_fmin v0, v[0:1], v2, off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2944,27 +2790,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2974,28 +2803,10 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GFX6-NEXT: v_min_f32_e32 v4, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB16_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -3011,27 +2822,10 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: v_max_num_f32_e32 v4, v2, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB17_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -3061,53 +2855,21 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX11-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB17_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX10-NEXT: v_max_f32_e32 v4, v2, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX10-NEXT: v_min_f32_e32 v2, v2, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_atomic_fmin v[0:1], v2, off offset:2044
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -3190,26 +2952,9 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -3219,27 +2964,9 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v2
-; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_fmin v2, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB17_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
@@ -4327,19 +4054,54 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX10-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB24_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3]
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
@@ -4398,28 +4160,69 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v11, v1
+; GFX7-NEXT: v_mov_b32_e32 v10, v0
+; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v0, v8
+; GFX7-NEXT: v_mov_b32_e32 v1, v9
+; GFX7-NEXT: v_mov_b32_e32 v2, v10
+; GFX7-NEXT: v_mov_b32_e32 v3, v11
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_mov_b32_e32 v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB24_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_atomic_fmin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v11, v1
+; GFX6-NEXT: v_mov_b32_e32 v10, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
+; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v0, v8
+; GFX6-NEXT: v_mov_b32_e32 v1, v9
+; GFX6-NEXT: v_mov_b32_e32 v2, v10
+; GFX6-NEXT: v_mov_b32_e32 v3, v11
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_mov_b32_e32 v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB24_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
@@ -17279,7 +17082,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi
ret void
}
-attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 7f052e1ae8b55..ee92ef9766e13 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -15,8 +15,8 @@
declare float @div.float.value()
declare float @div.double.value()
-define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
@@ -49,7 +49,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX7LESS-NEXT: .LBB0_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -78,7 +78,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -94,7 +94,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -109,7 +109,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
@@ -128,7 +128,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_mov_b32 s2, exec_lo
@@ -145,7 +145,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -174,7 +174,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -190,7 +190,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: .LBB0_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -205,7 +205,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1032-DPP-NEXT: .LBB0_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
@@ -224,7 +224,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
@@ -240,12 +240,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: s_nop 0
; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -297,7 +297,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -367,7 +367,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_5:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -423,7 +423,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_4:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -478,7 +478,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-NEXT: .LBB1_4:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
@@ -528,7 +528,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: .LBB1_4:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-NEXT: v_mov_b32_e32 v31, v0
@@ -577,7 +577,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: .LBB1_4:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -670,7 +670,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: .LBB1_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -748,7 +748,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: .LBB1_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -818,7 +818,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: .LBB1_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
@@ -897,7 +897,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: .LBB1_2:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
@@ -964,12 +964,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: .LBB1_2:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
- %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
@@ -1002,7 +1002,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX7LESS-NEXT: .LBB2_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -1031,127 +1031,74 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB2_3
+; GFX1064-NEXT: s_cbranch_execz .LBB2_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1064-NEXT: .LBB2_3:
+; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB2_2:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB2_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB2_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1032-NEXT: .LBB2_3:
+; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB2_2:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB2_3
+; GFX1164-NEXT: s_cbranch_execz .LBB2_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1164-NEXT: .LBB2_3:
+; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB2_2:
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB2_3
+; GFX1132-NEXT: s_cbranch_execz .LBB2_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1132-NEXT: .LBB2_3:
+; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB2_2:
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -1180,132 +1127,79 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1064-DPP-NEXT: .LBB2_3:
+; GFX1064-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-DPP-NEXT: .LBB2_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1032-DPP-NEXT: .LBB2_3:
+; GFX1032-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-DPP-NEXT: .LBB2_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1164-DPP-NEXT: .LBB2_3:
+; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT: .LBB2_2:
+; GFX1164-DPP-NEXT: s_nop 0
+; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1132-DPP-NEXT: .LBB2_3:
+; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT: .LBB2_2:
+; GFX1132-DPP-NEXT: s_nop 0
+; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -1357,7 +1251,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1427,7 +1321,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_5:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1455,18 +1349,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1064-NEXT: s_mov_b32 s32, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1474,30 +1368,16 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execz .LBB3_5
+; GFX1064-NEXT: s_cbranch_execz .LBB3_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB3_4
-; GFX1064-NEXT: .LBB3_5:
+; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB3_4:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1525,48 +1405,34 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1032-NEXT: s_mov_b32 s32, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execz .LBB3_5
+; GFX1032-NEXT: s_cbranch_execz .LBB3_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB3_4
-; GFX1032-NEXT: .LBB3_5:
+; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB3_4:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
@@ -1584,13 +1450,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_mov_b32 s32, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
@@ -1598,7 +1464,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1607,32 +1473,16 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execz .LBB3_5
+; GFX1164-NEXT: s_cbranch_execz .LBB3_4
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB3_4
-; GFX1164-NEXT: .LBB3_5:
+; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB3_4:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-NEXT: v_mov_b32_e32 v31, v0
@@ -1650,13 +1500,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1132-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1664,39 +1514,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execz .LBB3_5
+; GFX1132-NEXT: s_cbranch_execz .LBB3_4
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT: v_mov_b32_e32 v0, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB3_4
-; GFX1132-NEXT: .LBB3_5:
+; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB3_4:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1789,7 +1624,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: .LBB3_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1858,30 +1693,16 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2
-; GFX1064-DPP-NEXT: .LBB3_3:
+; GFX1064-DPP-NEXT: global_atomic_fmax v1, v0, s[0:1]
+; GFX1064-DPP-NEXT: .LBB3_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1940,32 +1761,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2
-; GFX1032-DPP-NEXT: .LBB3_3:
+; GFX1032-DPP-NEXT: global_atomic_fmax v1, v0, s[0:1]
+; GFX1032-DPP-NEXT: .LBB3_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
@@ -2030,37 +1837,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2
-; GFX1164-DPP-NEXT: .LBB3_3:
+; GFX1164-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: .LBB3_2:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
@@ -2113,378 +1904,26 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2
-; GFX1132-DPP-NEXT: .LBB3_3:
+; GFX1132-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: .LBB3_2:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
- %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1
ret void
}
-
-define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
-; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3
-; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2
-; GFX7LESS-NEXT: .LBB4_3:
-; GFX7LESS-NEXT: s_endpgm
-;
-; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB4_3
-; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB4_2
-; GFX9-NEXT: .LBB4_3:
-; GFX9-NEXT: s_endpgm
-;
-; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
-; GFX1064: ; %bb.0:
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB4_3
-; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1064-NEXT: .LBB4_3:
-; GFX1064-NEXT: s_endpgm
-;
-; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
-; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB4_3
-; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1032-NEXT: .LBB4_3:
-; GFX1032-NEXT: s_endpgm
-;
-; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
-; GFX1164: ; %bb.0:
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB4_3
-; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1164-NEXT: .LBB4_3:
-; GFX1164-NEXT: s_endpgm
-;
-; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
-; GFX1132: ; %bb.0:
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB4_3
-; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1132-NEXT: .LBB4_3:
-; GFX1132-NEXT: s_endpgm
-;
-; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
-; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
-; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
-; GFX9-DPP-NEXT: .LBB4_3:
-; GFX9-DPP-NEXT: s_endpgm
-;
-; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
-; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3
-; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1064-DPP-NEXT: .LBB4_3:
-; GFX1064-DPP-NEXT: s_endpgm
-;
-; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
-; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
-; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1032-DPP-NEXT: .LBB4_3:
-; GFX1032-DPP-NEXT: s_endpgm
-;
-; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
-; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3
-; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1164-DPP-NEXT: .LBB4_3:
-; GFX1164-DPP-NEXT: s_endpgm
-;
-; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
-; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
-; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1132-DPP-NEXT: .LBB4_3:
-; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -2518,7 +1957,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0
-; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
@@ -2532,11 +1971,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -2566,7 +2005,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0xff800000
-; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX9-NEXT: .LBB4_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
@@ -2576,14 +2015,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
-; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX9-NEXT: s_cbranch_scc1 .LBB4_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execz .LBB5_5
+; GFX9-NEXT: s_cbranch_execz .LBB4_5
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0
@@ -2591,7 +2030,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start
+; GFX9-NEXT: .LBB4_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
@@ -2602,11 +2041,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB5_4
-; GFX9-NEXT: .LBB5_5:
+; GFX9-NEXT: s_cbranch_execnz .LBB4_4
+; GFX9-NEXT: .LBB4_5:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -2634,49 +2073,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1064-NEXT: s_mov_b32 s32, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
-; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1064-NEXT: .LBB4_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_max_f32_e32 v2, v1, v2
-; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX1064-NEXT: s_cbranch_scc1 .LBB4_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execz .LBB5_5
+; GFX1064-NEXT: s_cbranch_execz .LBB4_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB5_4
-; GFX1064-NEXT: .LBB5_5:
+; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB4_4:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -2704,48 +2129,34 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1032-NEXT: s_mov_b32 s32, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
-; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1032-NEXT: .LBB4_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT: v_max_f32_e32 v2, v1, v2
-; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX1032-NEXT: s_cbranch_scc1 .LBB4_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execz .LBB5_5
+; GFX1032-NEXT: s_cbranch_execz .LBB4_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB5_4
-; GFX1032-NEXT: .LBB5_5:
+; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB4_4:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
@@ -2763,13 +2174,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-NEXT: s_mov_b32 s32, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1164-NEXT: .LBB4_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
@@ -2777,8 +2188,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_max_f32_e32 v2, v1, v2
-; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX1164-NEXT: s_cbranch_scc1 .LBB4_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
@@ -2786,32 +2197,16 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execz .LBB5_5
+; GFX1164-NEXT: s_cbranch_execz .LBB4_4
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB5_4
-; GFX1164-NEXT: .LBB5_5:
+; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB4_4:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-NEXT: v_mov_b32_e32 v31, v0
@@ -2829,13 +2224,13 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0xff800000
+; GFX1132-NEXT: v_mov_b32_e32 v1, 0xff800000
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1132-NEXT: .LBB4_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2843,39 +2238,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: v_max_f32_e32 v2, v1, v2
-; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX1132-NEXT: s_cbranch_scc1 .LBB4_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execz .LBB5_5
+; GFX1132-NEXT: s_cbranch_execz .LBB4_4
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT: v_mov_b32_e32 v0, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB5_4
-; GFX1132-NEXT: .LBB5_5:
+; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB4_4:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -2945,7 +2325,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
@@ -2953,7 +2333,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
@@ -2964,11 +2344,11 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2
-; GFX9-DPP-NEXT: .LBB5_3:
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3037,30 +2417,16 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2
-; GFX1064-DPP-NEXT: .LBB5_3:
+; GFX1064-DPP-NEXT: global_atomic_fmax v1, v0, s[0:1]
+; GFX1064-DPP-NEXT: .LBB4_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3119,32 +2485,18 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2
-; GFX1032-DPP-NEXT: .LBB5_3:
+; GFX1032-DPP-NEXT: global_atomic_fmax v1, v0, s[0:1]
+; GFX1032-DPP-NEXT: .LBB4_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
@@ -3209,37 +2561,21 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2
-; GFX1164-DPP-NEXT: .LBB5_3:
+; GFX1164-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: .LBB4_2:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
@@ -3292,48 +2628,32 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_2
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2
-; GFX1132-DPP-NEXT: .LBB5_3:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_atomic_max_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: .LBB4_2:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
- %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue monotonic, align 4
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue monotonic, align 4, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3
+; GFX7LESS-NEXT: s_cbranch_execz .LBB5_3
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -3344,7 +2664,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
@@ -3360,17 +2680,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2
-; GFX7LESS-NEXT: .LBB6_3:
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_2
+; GFX7LESS-NEXT: .LBB5_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB6_3
+; GFX9-NEXT: s_cbranch_execz .LBB5_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
@@ -3380,7 +2700,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX9-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
@@ -3391,17 +2711,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB6_2
-; GFX9-NEXT: .LBB6_3:
+; GFX9-NEXT: s_cbranch_execnz .LBB5_2
+; GFX9-NEXT: .LBB5_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB6_2
+; GFX1064-NEXT: s_cbranch_execz .LBB5_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
@@ -3409,15 +2729,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
-; GFX1064-NEXT: .LBB6_2:
+; GFX1064-NEXT: .LBB5_2:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB6_2
+; GFX1032-NEXT: s_cbranch_execz .LBB5_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
@@ -3425,17 +2745,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
-; GFX1032-NEXT: .LBB6_2:
+; GFX1032-NEXT: .LBB5_2:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB6_3
+; GFX1164-NEXT: s_cbranch_execz .LBB5_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
@@ -3445,7 +2765,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1164-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1164-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3458,18 +2778,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1164-NEXT: .LBB6_3:
+; GFX1164-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1164-NEXT: .LBB5_3:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB6_3
+; GFX1132-NEXT: s_cbranch_execz .LBB5_3
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
@@ -3477,7 +2797,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1132-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3489,17 +2809,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1132-NEXT: .LBB6_3:
+; GFX1132-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1132-NEXT: .LBB5_3:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
@@ -3509,7 +2829,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
@@ -3520,17 +2840,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX9-DPP-NEXT: .LBB6_3:
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX9-DPP-NEXT: .LBB5_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_2
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
@@ -3538,15 +2858,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
-; GFX1064-DPP-NEXT: .LBB6_2:
+; GFX1064-DPP-NEXT: .LBB5_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_2
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
@@ -3554,17 +2874,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
-; GFX1032-DPP-NEXT: .LBB6_2:
+; GFX1032-DPP-NEXT: .LBB5_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -3574,7 +2894,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3587,18 +2907,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1164-DPP-NEXT: .LBB6_3:
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1164-DPP-NEXT: .LBB5_3:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -3606,7 +2926,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3618,15 +2938,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1132-DPP-NEXT: .LBB6_3:
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1132-DPP-NEXT: .LBB5_3:
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -3660,7 +2980,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3677,11 +2997,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3713,7 +3033,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3725,11 +3045,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB7_1
+; GFX9-NEXT: s_cbranch_execnz .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3761,7 +3081,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3793,7 +3113,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b32 s14, s8
; GFX1164-NEXT: s_add_u32 s8, s2, 44
@@ -3815,7 +3135,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3829,11 +3149,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1164-NEXT: s_cbranch_execnz .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_add_u32 s8, s2, 44
; GFX1132-NEXT: s_addc_u32 s9, s3, 0
@@ -3854,7 +3174,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3867,11 +3187,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1132-NEXT: s_cbranch_execnz .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3903,7 +3223,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3915,11 +3235,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3951,7 +3271,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3983,7 +3303,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
@@ -4005,7 +3325,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4019,11 +3339,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
@@ -4044,7 +3364,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4057,22 +3377,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
- %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3
+; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -4083,7 +3403,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
@@ -4099,17 +3419,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2
-; GFX7LESS-NEXT: .LBB8_3:
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2
+; GFX7LESS-NEXT: .LBB7_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB8_3
+; GFX9-NEXT: s_cbranch_execz .LBB7_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
@@ -4119,7 +3439,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
@@ -4130,79 +3450,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB8_2
-; GFX9-NEXT: .LBB8_3:
+; GFX9-NEXT: s_cbranch_execnz .LBB7_2
+; GFX9-NEXT: .LBB7_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB8_3
+; GFX1064-NEXT: s_cbranch_execz .LBB7_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1064-NEXT: .LBB8_3:
+; GFX1064-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: .LBB7_2:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB8_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB7_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1032-NEXT: .LBB8_3:
+; GFX1032-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: .LBB7_2:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB8_3
+; GFX1164-NEXT: s_cbranch_execz .LBB7_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
@@ -4212,7 +3504,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1164-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4225,18 +3517,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1164-NEXT: .LBB8_3:
+; GFX1164-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1164-NEXT: .LBB7_3:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB8_3
+; GFX1132-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
@@ -4244,7 +3536,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4256,17 +3548,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1132-NEXT: .LBB8_3:
+; GFX1132-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1132-NEXT: .LBB7_3:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
@@ -4276,7 +3568,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
@@ -4287,79 +3579,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2
-; GFX9-DPP-NEXT: .LBB8_3:
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1064-DPP-NEXT: .LBB8_3:
+; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT: .LBB7_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1032-DPP-NEXT: .LBB8_3:
+; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT: .LBB7_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -4369,7 +3633,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4382,18 +3646,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1164-DPP-NEXT: .LBB8_3:
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1164-DPP-NEXT: .LBB7_3:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -4401,7 +3665,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4413,15 +3677,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1132-DPP-NEXT: .LBB8_3:
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1132-DPP-NEXT: .LBB7_3:
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -4455,7 +3719,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4472,11 +3736,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4508,7 +3772,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4520,11 +3784,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB9_1
+; GFX9-NEXT: s_cbranch_execnz .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4553,26 +3817,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4601,26 +3849,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b32 s14, s8
; GFX1164-NEXT: s_add_u32 s8, s2, 44
@@ -4642,7 +3874,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4656,11 +3888,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1164-NEXT: s_cbranch_execnz .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_add_u32 s8, s2, 44
; GFX1132-NEXT: s_addc_u32 s9, s3, 0
@@ -4681,7 +3913,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4694,11 +3926,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1132-NEXT: s_cbranch_execnz .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4730,7 +3962,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4742,11 +3974,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4775,26 +4007,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4823,26 +4039,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
@@ -4864,7 +4064,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4878,11 +4078,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
@@ -4903,7 +4103,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4916,22 +4116,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
- %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3
+; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -4942,7 +4142,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
@@ -4958,17 +4158,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2
-; GFX7LESS-NEXT: .LBB10_3:
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2
+; GFX7LESS-NEXT: .LBB9_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB10_3
+; GFX9-NEXT: s_cbranch_execz .LBB9_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
@@ -4978,7 +4178,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
@@ -4989,79 +4189,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB10_2
-; GFX9-NEXT: .LBB10_3:
+; GFX9-NEXT: s_cbranch_execnz .LBB9_2
+; GFX9-NEXT: .LBB9_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB10_3
+; GFX1064-NEXT: s_cbranch_execz .LBB9_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1064-NEXT: .LBB10_3:
+; GFX1064-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: .LBB9_2:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB10_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB9_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1032-NEXT: .LBB10_3:
+; GFX1032-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: .LBB9_2:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB10_3
+; GFX1164-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
@@ -5071,7 +4243,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1164-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5084,18 +4256,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1164-NEXT: .LBB10_3:
+; GFX1164-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1164-NEXT: .LBB9_3:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB10_3
+; GFX1132-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
@@ -5103,7 +4275,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5115,17 +4287,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1132-NEXT: .LBB10_3:
+; GFX1132-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1132-NEXT: .LBB9_3:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
@@ -5135,7 +4307,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
@@ -5146,79 +4318,51 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
-; GFX9-DPP-NEXT: .LBB10_3:
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1064-DPP-NEXT: .LBB10_3:
+; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT: .LBB9_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1032-DPP-NEXT: .LBB10_3:
+; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT: .LBB9_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -5228,7 +4372,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5241,18 +4385,18 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1164-DPP-NEXT: .LBB10_3:
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1164-DPP-NEXT: .LBB9_3:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -5260,7 +4404,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5272,15 +4416,15 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1132-DPP-NEXT: .LBB10_3:
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1132-DPP-NEXT: .LBB9_3:
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 monotonic, align 8
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 monotonic, align 8, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -5314,7 +4458,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5331,11 +4475,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -5367,7 +4511,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5379,11 +4523,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB11_1
+; GFX9-NEXT: s_cbranch_execnz .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -5412,26 +4556,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -5460,26 +4588,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b32 s14, s8
; GFX1164-NEXT: s_add_u32 s8, s2, 44
@@ -5501,7 +4613,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5515,11 +4627,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1164-NEXT: s_cbranch_execnz .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_add_u32 s8, s2, 44
; GFX1132-NEXT: s_addc_u32 s9, s3, 0
@@ -5540,7 +4652,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5553,11 +4665,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1132-NEXT: s_cbranch_execnz .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -5589,7 +4701,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5601,11 +4713,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -5634,26 +4746,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -5682,26 +4778,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: global_atomic_fmax_x2 v40, v[0:1], s[34:35]
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
@@ -5723,7 +4803,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5737,11 +4817,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
@@ -5762,7 +4842,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5775,11 +4855,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
- %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 8
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 8, !amdgpu.no.fine.grained.memory !1
ret void
}
@@ -5790,7 +4870,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3
+; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -5800,7 +4880,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0
@@ -5813,8 +4893,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2
-; GFX7LESS-NEXT: .LBB12_3:
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2
+; GFX7LESS-NEXT: .LBB11_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -5823,7 +4903,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB12_3
+; GFX9-NEXT: s_cbranch_execz .LBB11_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
@@ -5832,7 +4912,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start
+; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
@@ -5842,8 +4922,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB12_2
-; GFX9-NEXT: .LBB12_3:
+; GFX9-NEXT: s_cbranch_execnz .LBB11_2
+; GFX9-NEXT: .LBB11_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -5852,55 +4932,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB12_3
+; GFX1064-NEXT: s_cbranch_execz .LBB11_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1064-NEXT: .LBB12_3:
+; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB11_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB12_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB11_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1032-NEXT: .LBB12_3:
+; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB11_2:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -5910,60 +4964,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB12_3
+; GFX1164-NEXT: s_cbranch_execz .LBB11_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1164-NEXT: .LBB12_3:
+; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB11_2:
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB12_3
+; GFX1132-NEXT: s_cbranch_execz .LBB11_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1132-NEXT: .LBB12_3:
+; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB11_2:
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -5972,7 +4999,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
@@ -5981,7 +5008,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
@@ -5991,8 +5018,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
-; GFX9-DPP-NEXT: .LBB12_3:
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6001,55 +5028,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1064-DPP-NEXT: .LBB12_3:
+; GFX1064-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-DPP-NEXT: .LBB11_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1032-DPP-NEXT: .LBB12_3:
+; GFX1032-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-DPP-NEXT: .LBB11_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6059,60 +5060,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1164-DPP-NEXT: .LBB12_3:
+; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT: .LBB11_2:
+; GFX1164-DPP-NEXT: s_nop 0
+; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1132-DPP-NEXT: .LBB12_3:
+; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT: .LBB11_2:
+; GFX1132-DPP-NEXT: s_nop 0
+; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1
ret void
@@ -6125,7 +5099,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3
+; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -6135,7 +5109,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0
@@ -6148,8 +5122,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2
-; GFX7LESS-NEXT: .LBB13_3:
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2
+; GFX7LESS-NEXT: .LBB12_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6158,7 +5132,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB13_3
+; GFX9-NEXT: s_cbranch_execz .LBB12_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
@@ -6167,7 +5141,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
@@ -6177,8 +5151,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB13_2
-; GFX9-NEXT: .LBB13_3:
+; GFX9-NEXT: s_cbranch_execnz .LBB12_2
+; GFX9-NEXT: .LBB12_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6187,55 +5161,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB13_3
+; GFX1064-NEXT: s_cbranch_execz .LBB12_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1064-NEXT: .LBB13_3:
+; GFX1064-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB12_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB13_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB12_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1032-NEXT: .LBB13_3:
+; GFX1032-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB12_2:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6245,60 +5193,33 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB13_3
+; GFX1164-NEXT: s_cbranch_execz .LBB12_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1164-NEXT: .LBB13_3:
+; GFX1164-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB12_2:
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB13_3
+; GFX1132-NEXT: s_cbranch_execz .LBB12_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1132-NEXT: .LBB13_3:
+; GFX1132-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB12_2:
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6307,7 +5228,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
@@ -6316,7 +5237,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
@@ -6326,8 +5247,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
-; GFX9-DPP-NEXT: .LBB13_3:
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX9-DPP-NEXT: .LBB12_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6336,55 +5257,29 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1064-DPP-NEXT: .LBB13_3:
+; GFX1064-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1064-DPP-NEXT: .LBB12_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1032-DPP-NEXT: .LBB13_3:
+; GFX1032-DPP-NEXT: global_atomic_fmax v0, v1, s[0:1]
+; GFX1032-DPP-NEXT: .LBB12_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6394,66 +5289,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1164-DPP-NEXT: .LBB13_3:
+; GFX1164-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT: .LBB12_2:
+; GFX1164-DPP-NEXT: s_nop 0
+; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1132-DPP-NEXT: .LBB13_3:
+; GFX1132-DPP-NEXT: global_atomic_max_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT: .LBB12_2:
+; GFX1132-DPP-NEXT: s_nop 0
+; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
ret void
}
-attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index a9f49ad1508d5..8b9d39baeec14 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -15,8 +15,8 @@
declare float @div.float.value()
declare float @div.double.value()
-define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
@@ -49,7 +49,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX7LESS-NEXT: .LBB0_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -78,7 +78,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -94,7 +94,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -109,7 +109,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
@@ -128,7 +128,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_mov_b32 s2, exec_lo
@@ -145,7 +145,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -174,7 +174,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -190,7 +190,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: .LBB0_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -205,7 +205,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1032-DPP-NEXT: .LBB0_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
@@ -224,7 +224,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
@@ -240,12 +240,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: s_nop 0
; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -297,7 +297,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -367,7 +367,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_5:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -423,7 +423,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_4:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -478,7 +478,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-NEXT: .LBB1_4:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
@@ -528,7 +528,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: .LBB1_4:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-NEXT: v_mov_b32_e32 v31, v0
@@ -577,7 +577,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: .LBB1_4:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -670,7 +670,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: .LBB1_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -748,7 +748,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: .LBB1_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -818,7 +818,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: .LBB1_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
@@ -897,7 +897,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: .LBB1_2:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
@@ -964,12 +964,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: .LBB1_2:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
- %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
@@ -1002,7 +1002,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX7LESS-NEXT: .LBB2_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -1031,127 +1031,74 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB2_3
+; GFX1064-NEXT: s_cbranch_execz .LBB2_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1064-NEXT: .LBB2_3:
+; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB2_2:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB2_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB2_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1032-NEXT: .LBB2_3:
+; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB2_2:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB2_3
+; GFX1164-NEXT: s_cbranch_execz .LBB2_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1164-NEXT: .LBB2_3:
+; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB2_2:
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB2_3
+; GFX1132-NEXT: s_cbranch_execz .LBB2_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1132-NEXT: .LBB2_3:
+; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB2_2:
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -1180,132 +1127,79 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1064-DPP-NEXT: .LBB2_3:
+; GFX1064-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-DPP-NEXT: .LBB2_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1032-DPP-NEXT: .LBB2_3:
+; GFX1032-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-DPP-NEXT: .LBB2_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1164-DPP-NEXT: .LBB2_3:
+; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT: .LBB2_2:
+; GFX1164-DPP-NEXT: s_nop 0
+; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
-; GFX1132-DPP-NEXT: .LBB2_3:
+; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT: .LBB2_2:
+; GFX1132-DPP-NEXT: s_nop 0
+; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -1357,7 +1251,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1427,7 +1321,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_5:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1455,18 +1349,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1064-NEXT: s_mov_b32 s32, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1474,30 +1368,16 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execz .LBB3_5
+; GFX1064-NEXT: s_cbranch_execz .LBB3_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB3_4
-; GFX1064-NEXT: .LBB3_5:
+; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB3_4:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1525,48 +1405,34 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1032-NEXT: s_mov_b32 s32, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execz .LBB3_5
+; GFX1032-NEXT: s_cbranch_execz .LBB3_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB3_4
-; GFX1032-NEXT: .LBB3_5:
+; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB3_4:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
@@ -1584,13 +1450,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_mov_b32 s32, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
@@ -1598,7 +1464,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1607,32 +1473,16 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execz .LBB3_5
+; GFX1164-NEXT: s_cbranch_execz .LBB3_4
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB3_4
-; GFX1164-NEXT: .LBB3_5:
+; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB3_4:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-NEXT: v_mov_b32_e32 v31, v0
@@ -1650,13 +1500,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -1664,39 +1514,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execz .LBB3_5
+; GFX1132-NEXT: s_cbranch_execz .LBB3_4
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT: v_mov_b32_e32 v0, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB3_4
-; GFX1132-NEXT: .LBB3_5:
+; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB3_4:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1789,7 +1624,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: .LBB3_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1858,30 +1693,16 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_min_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2
-; GFX1064-DPP-NEXT: .LBB3_3:
+; GFX1064-DPP-NEXT: global_atomic_fmin v1, v0, s[0:1]
+; GFX1064-DPP-NEXT: .LBB3_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -1940,32 +1761,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_min_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2
-; GFX1032-DPP-NEXT: .LBB3_3:
+; GFX1032-DPP-NEXT: global_atomic_fmin v1, v0, s[0:1]
+; GFX1032-DPP-NEXT: .LBB3_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
@@ -2030,37 +1837,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_min_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2
-; GFX1164-DPP-NEXT: .LBB3_3:
+; GFX1164-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: .LBB3_2:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
@@ -2113,378 +1904,26 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_min_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2
-; GFX1132-DPP-NEXT: .LBB3_3:
+; GFX1132-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: .LBB3_2:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
- %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1
ret void
}
-
-define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
-; GFX7LESS: ; %bb.0:
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3
-; GFX7LESS-NEXT: ; %bb.1:
-; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
-; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
-; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
-; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
-; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
-; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
-; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2
-; GFX7LESS-NEXT: .LBB4_3:
-; GFX7LESS-NEXT: s_endpgm
-;
-; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB4_3
-; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB4_2
-; GFX9-NEXT: .LBB4_3:
-; GFX9-NEXT: s_endpgm
-;
-; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
-; GFX1064: ; %bb.0:
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB4_3
-; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1064-NEXT: .LBB4_3:
-; GFX1064-NEXT: s_endpgm
-;
-; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
-; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB4_3
-; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1032-NEXT: .LBB4_3:
-; GFX1032-NEXT: s_endpgm
-;
-; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
-; GFX1164: ; %bb.0:
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB4_3
-; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1164-NEXT: .LBB4_3:
-; GFX1164-NEXT: s_endpgm
-;
-; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
-; GFX1132: ; %bb.0:
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB4_3
-; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1132-NEXT: .LBB4_3:
-; GFX1132-NEXT: s_endpgm
-;
-; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
-; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
-; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
-; GFX9-DPP-NEXT: .LBB4_3:
-; GFX9-DPP-NEXT: s_endpgm
-;
-; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
-; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3
-; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1064-DPP-NEXT: .LBB4_3:
-; GFX1064-DPP-NEXT: s_endpgm
-;
-; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
-; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
-; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1032-DPP-NEXT: .LBB4_3:
-; GFX1032-DPP-NEXT: s_endpgm
-;
-; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
-; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3
-; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1164-DPP-NEXT: .LBB4_3:
-; GFX1164-DPP-NEXT: s_endpgm
-;
-; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
-; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
-; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
-; GFX1132-DPP-NEXT: .LBB4_3:
-; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -2518,7 +1957,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0
; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0
-; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
@@ -2532,11 +1971,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmin_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -2566,7 +2005,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX9-NEXT: .LBB4_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
@@ -2576,14 +2015,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
-; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX9-NEXT: s_cbranch_scc1 .LBB4_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execz .LBB5_5
+; GFX9-NEXT: s_cbranch_execz .LBB4_5
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0
@@ -2591,7 +2030,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start
+; GFX9-NEXT: .LBB4_4: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
@@ -2602,11 +2041,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB5_4
-; GFX9-NEXT: .LBB5_5:
+; GFX9-NEXT: s_cbranch_execnz .LBB4_4
+; GFX9-NEXT: .LBB4_5:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -2634,49 +2073,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1064-NEXT: s_mov_b32 s32, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1064-NEXT: s_mov_b64 s[0:1], exec
-; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1064-NEXT: .LBB4_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_min_f32_e32 v2, v1, v2
-; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX1064-NEXT: s_cbranch_scc1 .LBB4_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execz .LBB5_5
+; GFX1064-NEXT: s_cbranch_execz .LBB4_4
; GFX1064-NEXT: ; %bb.3:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB5_4
-; GFX1064-NEXT: .LBB5_5:
+; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB4_4:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmin_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -2704,48 +2129,34 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1032-NEXT: s_mov_b32 s32, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1032-NEXT: s_mov_b32 s0, exec_lo
-; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1032-NEXT: .LBB4_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
-; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1032-NEXT: v_min_f32_e32 v2, v1, v2
-; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX1032-NEXT: s_cbranch_scc1 .LBB4_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execz .LBB5_5
+; GFX1032-NEXT: s_cbranch_execz .LBB4_4
; GFX1032-NEXT: ; %bb.3:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
-; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB5_4
-; GFX1032-NEXT: .LBB5_5:
+; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB4_4:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-NEXT: s_mov_b32 s14, s8
@@ -2763,13 +2174,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-NEXT: s_mov_b32 s32, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1164-NEXT: .LBB4_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
@@ -2777,8 +2188,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_min_f32_e32 v2, v1, v2
-; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX1164-NEXT: s_cbranch_scc1 .LBB4_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
@@ -2786,32 +2197,16 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execz .LBB5_5
+; GFX1164-NEXT: s_cbranch_execz .LBB4_4
; GFX1164-NEXT: ; %bb.3:
; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB5_4
-; GFX1164-NEXT: .LBB5_5:
+; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB4_4:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-NEXT: v_mov_b32_e32 v31, v0
@@ -2829,13 +2224,13 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-NEXT: s_mov_b32 s32, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7f800000
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop
+; GFX1132-NEXT: .LBB4_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2
+; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2843,39 +2238,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: v_min_f32_e32 v2, v1, v2
-; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX1132-NEXT: s_cbranch_scc1 .LBB4_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execz .LBB5_5
+; GFX1132-NEXT: s_cbranch_execz .LBB4_4
; GFX1132-NEXT: ; %bb.3:
; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
+; GFX1132-NEXT: v_mov_b32_e32 v0, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
-; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB5_4
-; GFX1132-NEXT: .LBB5_5:
+; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB4_4:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -2945,7 +2325,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
@@ -2953,7 +2333,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
@@ -2964,11 +2344,11 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2
-; GFX9-DPP-NEXT: .LBB5_3:
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
+; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3037,30 +2417,16 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_min_f32_e32 v0, v0, v6
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2
-; GFX1064-DPP-NEXT: .LBB5_3:
+; GFX1064-DPP-NEXT: global_atomic_fmin v1, v0, s[0:1]
+; GFX1064-DPP-NEXT: .LBB4_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3119,32 +2485,18 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
-; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_min_f32_e32 v0, v0, v6
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2
-; GFX1032-DPP-NEXT: .LBB5_3:
+; GFX1032-DPP-NEXT: global_atomic_fmin v1, v0, s[0:1]
+; GFX1032-DPP-NEXT: .LBB4_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
@@ -3209,37 +2561,21 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_min_f32_e32 v4, v4, v6
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2
-; GFX1164-DPP-NEXT: .LBB5_3:
+; GFX1164-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1164-DPP-NEXT: .LBB4_2:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
@@ -3292,48 +2628,32 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_2
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
-; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_min_f32_e32 v4, v4, v6
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2
-; GFX1132-DPP-NEXT: .LBB5_3:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: global_atomic_min_f32 v4, v0, s[0:1]
+; GFX1132-DPP-NEXT: .LBB4_2:
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
- %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue monotonic, align 4
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue monotonic, align 4, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3
+; GFX7LESS-NEXT: s_cbranch_execz .LBB5_3
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -3344,7 +2664,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
@@ -3360,17 +2680,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2
-; GFX7LESS-NEXT: .LBB6_3:
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_2
+; GFX7LESS-NEXT: .LBB5_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB6_3
+; GFX9-NEXT: s_cbranch_execz .LBB5_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
@@ -3380,7 +2700,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX9-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
@@ -3391,17 +2711,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB6_2
-; GFX9-NEXT: .LBB6_3:
+; GFX9-NEXT: s_cbranch_execnz .LBB5_2
+; GFX9-NEXT: .LBB5_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB6_2
+; GFX1064-NEXT: s_cbranch_execz .LBB5_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
@@ -3409,15 +2729,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
-; GFX1064-NEXT: .LBB6_2:
+; GFX1064-NEXT: .LBB5_2:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB6_2
+; GFX1032-NEXT: s_cbranch_execz .LBB5_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
@@ -3425,17 +2745,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
-; GFX1032-NEXT: .LBB6_2:
+; GFX1032-NEXT: .LBB5_2:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB6_3
+; GFX1164-NEXT: s_cbranch_execz .LBB5_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
@@ -3445,7 +2765,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1164-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1164-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3458,18 +2778,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1164-NEXT: .LBB6_3:
+; GFX1164-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1164-NEXT: .LBB5_3:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB6_3
+; GFX1132-NEXT: s_cbranch_execz .LBB5_3
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
@@ -3477,7 +2797,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1132-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3489,17 +2809,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1132-NEXT: .LBB6_3:
+; GFX1132-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1132-NEXT: .LBB5_3:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
@@ -3509,7 +2829,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
@@ -3520,17 +2840,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX9-DPP-NEXT: .LBB6_3:
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX9-DPP-NEXT: .LBB5_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_2
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
@@ -3538,15 +2858,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
-; GFX1064-DPP-NEXT: .LBB6_2:
+; GFX1064-DPP-NEXT: .LBB5_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_2
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
@@ -3554,17 +2874,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
-; GFX1032-DPP-NEXT: .LBB6_2:
+; GFX1032-DPP-NEXT: .LBB5_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -3574,7 +2894,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3587,18 +2907,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1164-DPP-NEXT: .LBB6_3:
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1164-DPP-NEXT: .LBB5_3:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -3606,7 +2926,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3618,15 +2938,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2
-; GFX1132-DPP-NEXT: .LBB6_3:
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2
+; GFX1132-DPP-NEXT: .LBB5_3:
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -3660,7 +2980,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3677,11 +2997,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3713,7 +3033,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3725,11 +3045,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB7_1
+; GFX9-NEXT: s_cbranch_execnz .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3761,7 +3081,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3793,7 +3113,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b32 s14, s8
; GFX1164-NEXT: s_add_u32 s8, s2, 44
@@ -3815,7 +3135,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3829,11 +3149,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1164-NEXT: s_cbranch_execnz .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_add_u32 s8, s2, 44
; GFX1132-NEXT: s_addc_u32 s9, s3, 0
@@ -3854,7 +3174,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3867,11 +3187,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1132-NEXT: s_cbranch_execnz .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3903,7 +3223,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -3915,11 +3235,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3951,7 +3271,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -3983,7 +3303,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
@@ -4005,7 +3325,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4019,11 +3339,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
@@ -4044,7 +3364,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4057,22 +3377,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
- %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3
+; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -4083,7 +3403,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
@@ -4099,17 +3419,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2
-; GFX7LESS-NEXT: .LBB8_3:
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2
+; GFX7LESS-NEXT: .LBB7_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB8_3
+; GFX9-NEXT: s_cbranch_execz .LBB7_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
@@ -4119,7 +3439,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
@@ -4130,79 +3450,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB8_2
-; GFX9-NEXT: .LBB8_3:
+; GFX9-NEXT: s_cbranch_execnz .LBB7_2
+; GFX9-NEXT: .LBB7_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB8_3
+; GFX1064-NEXT: s_cbranch_execz .LBB7_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1064-NEXT: .LBB8_3:
+; GFX1064-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: .LBB7_2:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB8_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB7_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1032-NEXT: .LBB8_3:
+; GFX1032-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: .LBB7_2:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB8_3
+; GFX1164-NEXT: s_cbranch_execz .LBB7_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
@@ -4212,7 +3504,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1164-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4225,18 +3517,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1164-NEXT: .LBB8_3:
+; GFX1164-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1164-NEXT: .LBB7_3:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB8_3
+; GFX1132-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
@@ -4244,7 +3536,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4256,17 +3548,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1132-NEXT: .LBB8_3:
+; GFX1132-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1132-NEXT: .LBB7_3:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
@@ -4276,7 +3568,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
@@ -4287,79 +3579,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2
-; GFX9-DPP-NEXT: .LBB8_3:
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1064-DPP-NEXT: .LBB8_3:
+; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT: .LBB7_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1032-DPP-NEXT: .LBB8_3:
+; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT: .LBB7_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -4369,7 +3633,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4382,18 +3646,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1164-DPP-NEXT: .LBB8_3:
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1164-DPP-NEXT: .LBB7_3:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -4401,7 +3665,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4413,15 +3677,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2
-; GFX1132-DPP-NEXT: .LBB8_3:
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
+; GFX1132-DPP-NEXT: .LBB7_3:
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -4455,7 +3719,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4472,11 +3736,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4508,7 +3772,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4520,11 +3784,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB9_1
+; GFX9-NEXT: s_cbranch_execnz .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4553,26 +3817,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4601,26 +3849,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b32 s14, s8
; GFX1164-NEXT: s_add_u32 s8, s2, 44
@@ -4642,7 +3874,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4656,11 +3888,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1164-NEXT: s_cbranch_execnz .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_add_u32 s8, s2, 44
; GFX1132-NEXT: s_addc_u32 s9, s3, 0
@@ -4681,7 +3913,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4694,11 +3926,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1132-NEXT: s_cbranch_execnz .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4730,7 +3962,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4742,11 +3974,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4775,26 +4007,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -4823,26 +4039,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
@@ -4864,7 +4064,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4878,11 +4078,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
@@ -4903,7 +4103,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -4916,22 +4116,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
- %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3
+; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -4942,7 +4142,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
@@ -4958,17 +4158,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2
-; GFX7LESS-NEXT: .LBB10_3:
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2
+; GFX7LESS-NEXT: .LBB9_3:
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB10_3
+; GFX9-NEXT: s_cbranch_execz .LBB9_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
@@ -4978,7 +4178,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
@@ -4989,79 +4189,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB10_2
-; GFX9-NEXT: .LBB10_3:
+; GFX9-NEXT: s_cbranch_execnz .LBB9_2
+; GFX9-NEXT: .LBB9_3:
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB10_3
+; GFX1064-NEXT: s_cbranch_execz .LBB9_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1064-NEXT: .LBB10_3:
+; GFX1064-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: .LBB9_2:
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB10_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB9_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1032-NEXT: .LBB10_3:
+; GFX1032-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: .LBB9_2:
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB10_3
+; GFX1164-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
@@ -5071,7 +4243,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1164-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5084,18 +4256,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1164-NEXT: .LBB10_3:
+; GFX1164-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1164-NEXT: .LBB9_3:
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB10_3
+; GFX1132-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
@@ -5103,7 +4275,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5115,17 +4287,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1132-NEXT: .LBB10_3:
+; GFX1132-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1132-NEXT: .LBB9_3:
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
@@ -5135,7 +4307,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
@@ -5146,79 +4318,51 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
-; GFX9-DPP-NEXT: .LBB10_3:
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1064-DPP-NEXT: .LBB10_3:
+; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1064-DPP-NEXT: .LBB9_2:
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
-; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1032-DPP-NEXT: .LBB10_3:
+; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v2, v[0:1], s[0:1]
+; GFX1032-DPP-NEXT: .LBB9_2:
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -5228,7 +4372,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5241,18 +4385,18 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1164-DPP-NEXT: .LBB10_3:
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1164-DPP-NEXT: .LBB9_3:
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -5260,7 +4404,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5272,15 +4416,15 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
-; GFX1132-DPP-NEXT: .LBB10_3:
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1132-DPP-NEXT: .LBB9_3:
; GFX1132-DPP-NEXT: s_endpgm
- %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 monotonic, align 8
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 monotonic, align 8, !amdgpu.no.fine.grained.memory !1
ret void
}
-define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe(ptr addrspace(1) %ptr) #0 {
-; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX7LESS: ; %bb.0:
; GFX7LESS-NEXT: s_mov_b32 s32, 0
; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
@@ -5314,7 +4458,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5331,11 +4475,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1
; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7LESS-NEXT: s_endpgm
;
-; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -5367,7 +4511,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5379,11 +4523,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-NEXT: s_cbranch_execnz .LBB11_1
+; GFX9-NEXT: s_cbranch_execnz .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
;
-; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -5412,26 +4556,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1064-NEXT: s_endpgm
;
-; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -5460,26 +4588,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-NEXT: s_mov_b32 s0, 0
-; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1032-NEXT: s_endpgm
;
-; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1164: ; %bb.0:
; GFX1164-NEXT: s_mov_b32 s14, s8
; GFX1164-NEXT: s_add_u32 s8, s2, 44
@@ -5501,7 +4613,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1164-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5515,11 +4627,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1164-NEXT: s_cbranch_execnz .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-NEXT: s_endpgm
;
-; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: s_add_u32 s8, s2, 44
; GFX1132-NEXT: s_addc_u32 s9, s3, 0
@@ -5540,7 +4652,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1132-NEXT: s_mov_b32 s0, 0
-; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5553,11 +4665,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1132-NEXT: s_cbranch_execnz .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-NEXT: s_endpgm
;
-; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX9-DPP: ; %bb.0:
; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -5589,7 +4701,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5601,11 +4713,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-DPP-NEXT: s_endpgm
;
-; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1064-DPP: ; %bb.0:
; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -5634,26 +4746,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1064-DPP-NEXT: s_endpgm
;
-; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
@@ -5682,26 +4778,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
-; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
-; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1
-; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: global_atomic_fmin_x2 v40, v[0:1], s[34:35]
; GFX1032-DPP-NEXT: s_endpgm
;
-; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1164-DPP: ; %bb.0:
; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
@@ -5723,7 +4803,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
-; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5737,11 +4817,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1164-DPP-NEXT: s_endpgm
;
-; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope__amdgpu_no_fine_grained_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
@@ -5762,7 +4842,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
-; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
@@ -5775,11 +4855,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1
; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call double @div.double.value()
- %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 8
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 8, !amdgpu.no.fine.grained.memory !1
ret void
}
@@ -5790,7 +4870,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3
+; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -5800,7 +4880,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0
@@ -5813,8 +4893,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2
-; GFX7LESS-NEXT: .LBB12_3:
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2
+; GFX7LESS-NEXT: .LBB11_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -5823,7 +4903,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB12_3
+; GFX9-NEXT: s_cbranch_execz .LBB11_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
@@ -5832,7 +4912,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start
+; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
@@ -5842,8 +4922,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB12_2
-; GFX9-NEXT: .LBB12_3:
+; GFX9-NEXT: s_cbranch_execnz .LBB11_2
+; GFX9-NEXT: .LBB11_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -5852,55 +4932,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB12_3
+; GFX1064-NEXT: s_cbranch_execz .LBB11_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1064-NEXT: .LBB12_3:
+; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB11_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB12_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB11_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1032-NEXT: .LBB12_3:
+; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB11_2:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -5910,60 +4964,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB12_3
+; GFX1164-NEXT: s_cbranch_execz .LBB11_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1164-NEXT: .LBB12_3:
+; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB11_2:
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB12_3
+; GFX1132-NEXT: s_cbranch_execz .LBB11_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1132-NEXT: .LBB12_3:
+; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB11_2:
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -5972,7 +4999,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
@@ -5981,7 +5008,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
@@ -5991,8 +5018,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
-; GFX9-DPP-NEXT: .LBB12_3:
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6001,55 +5028,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1064-DPP-NEXT: .LBB12_3:
+; GFX1064-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-DPP-NEXT: .LBB11_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1032-DPP-NEXT: .LBB12_3:
+; GFX1032-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-DPP-NEXT: .LBB11_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6059,60 +5060,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1164-DPP-NEXT: .LBB12_3:
+; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT: .LBB11_2:
+; GFX1164-DPP-NEXT: s_nop 0
+; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2
-; GFX1132-DPP-NEXT: .LBB12_3:
+; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT: .LBB11_2:
+; GFX1132-DPP-NEXT: s_nop 0
+; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1
ret void
@@ -6125,7 +5099,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3
+; GFX7LESS-NEXT: s_cbranch_execz .LBB12_3
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
@@ -6135,7 +5109,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0
@@ -6148,8 +5122,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2
-; GFX7LESS-NEXT: .LBB13_3:
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2
+; GFX7LESS-NEXT: .LBB12_3:
; GFX7LESS-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6158,7 +5132,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB13_3
+; GFX9-NEXT: s_cbranch_execz .LBB12_3
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], 0
@@ -6167,7 +5141,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
@@ -6177,8 +5151,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_execnz .LBB13_2
-; GFX9-NEXT: .LBB13_3:
+; GFX9-NEXT: s_cbranch_execnz .LBB12_2
+; GFX9-NEXT: .LBB12_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6187,55 +5161,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT: s_cbranch_execz .LBB13_3
+; GFX1064-NEXT: s_cbranch_execz .LBB12_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1064-NEXT: .LBB13_3:
+; GFX1064-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-NEXT: .LBB12_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT: s_cbranch_execz .LBB13_3
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB12_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1032-NEXT: .LBB13_3:
+; GFX1032-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-NEXT: .LBB12_2:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6245,60 +5193,33 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT: s_cbranch_execz .LBB13_3
+; GFX1164-NEXT: s_cbranch_execz .LBB12_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1164-NEXT: .LBB13_3:
+; GFX1164-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-NEXT: .LBB12_2:
+; GFX1164-NEXT: s_nop 0
+; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT: s_cbranch_execz .LBB13_3
+; GFX1132-NEXT: s_cbranch_execz .LBB12_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1132-NEXT: .LBB13_3:
+; GFX1132-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-NEXT: .LBB12_2:
+; GFX1132-NEXT: s_nop 0
+; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6307,7 +5228,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX9-DPP-NEXT: ; %bb.1:
; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
@@ -6316,7 +5237,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
@@ -6326,8 +5247,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
-; GFX9-DPP-NEXT: .LBB13_3:
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
+; GFX9-DPP-NEXT: .LBB12_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6336,55 +5257,29 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_2
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1064-DPP-NEXT: .LBB13_3:
+; GFX1064-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1064-DPP-NEXT: .LBB12_2:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_2
; GFX1032-DPP-NEXT: ; %bb.1:
; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
-; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1032-DPP-NEXT: .LBB13_3:
+; GFX1032-DPP-NEXT: global_atomic_fmin v0, v1, s[0:1]
+; GFX1032-DPP-NEXT: .LBB12_2:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -6394,66 +5289,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_2
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
-; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1164-DPP-NEXT: .LBB13_3:
+; GFX1164-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1164-DPP-NEXT: .LBB12_2:
+; GFX1164-DPP-NEXT: s_nop 0
+; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_2
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
-; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
-; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
-; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
-; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
-; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
-; GFX1132-DPP-NEXT: .LBB13_3:
+; GFX1132-DPP-NEXT: global_atomic_min_f32 v0, v1, s[0:1]
+; GFX1132-DPP-NEXT: .LBB12_2:
+; GFX1132-DPP-NEXT: s_nop 0
+; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
ret void
}
-attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
index 7290a91c9ccd2..b0d2824a64ee3 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
@@ -3345,176 +3345,936 @@ define float @test_atomicrmw_fsub_f32_global_agent__amdgpu_ignore_denormal_mode_
;---------------------------------------------------------------------
define float @test_atomicrmw_fmax_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret float [[TMP6]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[TMP6]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
ret float %res
}
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %res
}
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret float %res
}
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret float %res
}
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret float [[TMP6]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[TMP6]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret float %res
}
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %res
}
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %res
}
define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %res
@@ -3525,176 +4285,936 @@ define float @test_atomicrmw_fmax_f32_global_agent__amdgpu_ignore_denormal_mode_
;---------------------------------------------------------------------
define float @test_atomicrmw_fmin_f32_global_agent(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret float [[TMP6]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[TMP6]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4
+; GFX12-NEXT: ret float [[RES]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
+ ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: ret float [[RES]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret float [[RES]]
;
- %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst
- ret float %res
-}
-
-define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[RES]]
;
- %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
- ret float %res
-}
-
-define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret float %res
}
define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret float %res
}
define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret float [[TMP6]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[TMP6]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0
ret float %res
}
define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %res
}
define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %res
}
define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
index 05fb224e6f145..c5bf26dc4c0e1 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll
@@ -3367,66 +3367,339 @@ define float @test_atomicrmw_fmax_f32_global_system(ptr addrspace(1) %ptr, float
}
define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %res
}
define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
ret float %res
}
define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret float %res
@@ -3455,66 +3728,339 @@ define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode
}
define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %res
}
define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %res
}
define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmax_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %res
@@ -3542,71 +4088,344 @@ define float @test_atomicrmw_fmin_f32_global_system(ptr addrspace(1) %ptr, float
; COMMON: atomicrmw.end:
; COMMON-NEXT: ret float [[TMP6]]
;
- %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst
- ret float %res
-}
-
-define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst
+ ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX12-NEXT: ret float [[RES]]
+;
+ %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %res
+}
+
+define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[RES]]
;
- %res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0
- ret float %res
-}
-
-define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.remote.memory !0
ret float %res
}
define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret float %res
@@ -3635,66 +4454,339 @@ define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode
}
define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %res
}
define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %res
}
define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %value) {
-; COMMON-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret float [[TMP6]]
+; GFX803-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret float [[TMP6]]
+;
+; GFX906-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret float [[TMP6]]
+;
+; GFX908-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret float [[TMP6]]
+;
+; GFX90A-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret float [[TMP6]]
+;
+; GFX940-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR]], align 4
+; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX940: atomicrmw.start:
+; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]])
+; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32
+; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4
+; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX940: atomicrmw.end:
+; GFX940-NEXT: ret float [[TMP6]]
+;
+; GFX10-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX10-NEXT: ret float [[RES]]
+;
+; GFX11-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX11-NEXT: ret float [[RES]]
+;
+; GFX12-LABEL: define float @test_atomicrmw_fmin_f32_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], float [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
+; GFX12-NEXT: ret float [[RES]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, float %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
index af6b7e0addfb1..ee360dee79425 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll
@@ -1241,88 +1241,468 @@ define double @test_atomicrmw_fsub_f64_global_agent__amdgpu_ignore_denormal_mode
;---------------------------------------------------------------------
define double @test_atomicrmw_fmax_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret double [[TMP6]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret double [[TMP6]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
ret double %res
}
define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %res
}
define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret double %res
}
define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret double %res
@@ -1457,88 +1837,468 @@ define double @test_atomicrmw_fmax_f64_global_agent__amdgpu_ignore_denormal_mode
;---------------------------------------------------------------------
define double @test_atomicrmw_fmin_f64_global_agent(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret double [[TMP6]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret double [[TMP6]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst
ret double %res
}
define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %res
}
define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret double %res
}
define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret double %res
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
index 69d65e6f1f379..ac5dd55002f3f 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll
@@ -1263,66 +1263,339 @@ define double @test_atomicrmw_fmax_f64_global_system(ptr addrspace(1) %ptr, doub
}
define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %res
}
define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
ret double %res
}
define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmax_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmax ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret double %res
@@ -1479,66 +1752,339 @@ define double @test_atomicrmw_fmin_f64_global_system(ptr addrspace(1) %ptr, doub
}
define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX90A-NEXT: ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]]
+; GFX10-NEXT: ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0
ret double %res
}
define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX90A: atomicrmw.start:
+; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX90A-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX90A-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX90A-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX90A: atomicrmw.end:
+; GFX90A-NEXT: ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX10: atomicrmw.start:
+; GFX10-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ]
+; GFX10-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX10-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX10-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX10-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX10-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double
+; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX10: atomicrmw.end:
+; GFX10-NEXT: ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.remote.memory !0
ret double %res
}
define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, double %value) {
-; COMMON-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
-; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
-; COMMON-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
-; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
-; COMMON: atomicrmw.start:
-; COMMON-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
-; COMMON-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
-; COMMON-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
-; COMMON-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
-; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; COMMON-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
-; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; COMMON: atomicrmw.end:
-; COMMON-NEXT: ret double [[TMP6]]
+; GFX803-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX803-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX803: atomicrmw.start:
+; GFX803-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX803-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX803-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX803-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX803-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX803-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX803: atomicrmw.end:
+; GFX803-NEXT: ret double [[TMP6]]
+;
+; GFX906-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX906-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX906: atomicrmw.start:
+; GFX906-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX906-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX906-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX906-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX906-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX906-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX906: atomicrmw.end:
+; GFX906-NEXT: ret double [[TMP6]]
+;
+; GFX908-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP6]]
+;
+; GFX90A-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX90A-NEXT: ret double [[RES]]
+;
+; GFX940-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX940-NEXT: ret double [[RES]]
+;
+; GFX10-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX10-NEXT: [[RES:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR]], double [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; GFX10-NEXT: ret double [[RES]]
+;
+; GFX11-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX11-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP6]]
+;
+; GFX12-LABEL: define double @test_atomicrmw_fmin_f64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] {
+; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR]], align 8
+; GFX12-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX12: atomicrmw.start:
+; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX12-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]])
+; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64
+; GFX12-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
+; GFX12-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8
+; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
+; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
+; GFX12-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; GFX12-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX12: atomicrmw.end:
+; GFX12-NEXT: ret double [[TMP6]]
;
%res = atomicrmw fmin ptr addrspace(1) %ptr, double %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret double %res
More information about the llvm-branch-commits
mailing list